Skip to content

Instantly share code, notes, and snippets.

@PhilippSalvisberg
Last active October 1, 2024 08:54
Show Gist options
  • Save PhilippSalvisberg/520cb8269462f1011c319cbf05b9b12b to your computer and use it in GitHub Desktop.
Save PhilippSalvisberg/520cb8269462f1011c319cbf05b9b12b to your computer and use it in GitHub Desktop.
PL/SQL Package to convert a UTF-8 codepoint to bytes and vice versa
create or replace package utf8 is
function codepoint_to_bytes(in_codepoint in varchar2) return varchar2 deterministic;
function bytes_to_codepoint(in_bytes in varchar2) return varchar2 deterministic;
function codepoint_to_unistr(in_codepoint in varchar2) return varchar2 deterministic;
function unistr_to_codepoint(in_unistr in varchar2) return varchar2 deterministic;
end utf8;
/
create or replace package body utf8 is
function hex_to_num(in_hex in varchar2) return integer deterministic;
function int_to_binary(in_int in integer, in_len in integer default 21) return varchar2 deterministic;
function build_byte(in_prefix in varchar2, in_suffix in varchar2) return varchar2 deterministic;
function bits_from_byte(in_byte in varchar2, in_start_pos in integer default 1) return varchar2 deterministic;
function hex_to_num(in_hex in varchar2) return integer deterministic is
co_hex constant varchar2(8 char) := in_hex;
begin
return to_number(co_hex default 0 on conversion error, rpad('X', length(co_hex), 'X'), q'[nls_numeric_characters='.,']');
end hex_to_num;
function int_to_binary(in_int in integer, in_len in integer default 21) return varchar2 deterministic is
l_result varchar2(21 char);
begin
-- query based on https://sqlpatterns.wordpress.com/2017/05/25/integer-to-binary-conversion-in-oracle-sql/
select listagg(sign(bitand(in_int, power(2, level - 1))), null) within group(order by level desc)
into l_result
from dual
connect by power(2, level - 1) <= in_int;
return lpad(l_result, in_len, '0');
end int_to_binary;
function build_byte(in_prefix in varchar2, in_suffix in varchar2) return varchar2 deterministic is
co_prefix constant varchar2(7 char) := in_prefix;
co_suffix constant varchar2(8 char) := in_suffix;
l_binary varchar2(8 char);
l_result varchar2(4 char);
begin
l_binary := co_prefix || co_suffix;
select to_char(
bin_to_num(
to_number(substr(l_binary, 1, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']'),
to_number(substr(l_binary, 2, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']'),
to_number(substr(l_binary, 3, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']'),
to_number(substr(l_binary, 4, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']'),
to_number(substr(l_binary, 5, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']'),
to_number(substr(l_binary, 6, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']'),
to_number(substr(l_binary, 7, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']'),
to_number(substr(l_binary, 8, 1) default 0 on conversion error, '9', q'[nls_numeric_characters='.,']')
),
'FMXX'
)
into l_result
from dual;
return lpad(l_result, 2, '0');
end build_byte;
function bits_from_byte(in_byte in varchar2, in_start_pos in integer default 1) return varchar2 deterministic is
co_byte constant varchar2(2 char) := in_byte;
l_int integer;
l_binary varchar2(8 char);
begin
l_int := hex_to_num(co_byte);
l_binary := int_to_binary(l_int, 8);
return substr(l_binary, in_start_pos);
end bits_from_byte;
function codepoint_to_bytes(in_codepoint in varchar2) return varchar2 deterministic is
co_codepoint constant varchar2(8 char) := in_codepoint;
l_cp integer;
l_binary varchar2(21 char);
l_return varchar2(8 char);
begin
-- algorithm based on description in https://en.wikipedia.org/wiki/UTF-8#Description
if upper(co_codepoint) like 'U+%' then
l_cp := hex_to_num(substr(co_codepoint, 3));
else
l_cp := hex_to_num(co_codepoint);
end if;
if l_cp <= hex_to_num('007F') then -- one byte
l_binary := int_to_binary(l_cp, 7);
l_return := build_byte('0', l_binary);
elsif l_cp <= hex_to_num('07FF') then -- two bytes
l_binary := int_to_binary(l_cp, 11);
l_return := build_byte('110', substr(l_binary, 1, 5))
|| build_byte('10', substr(l_binary, 6));
elsif l_cp <= hex_to_num('FFFF') then -- three bytes
l_binary := int_to_binary(l_cp, 16);
l_return := build_byte('1110', substr(l_binary, 1, 4))
|| build_byte('10', substr(l_binary, 5, 6))
|| build_byte('10', substr(l_binary, 11));
elsif l_cp <= hex_to_num('10FFFF') then -- four bytes
l_binary := int_to_binary(l_cp, 21);
l_return := build_byte('11110', substr(l_binary, 1, 3))
|| build_byte('10', substr(l_binary, 4, 6))
|| build_byte('10', substr(l_binary, 10, 6))
|| build_byte('10', substr(l_binary, 16));
end if;
return l_return;
end codepoint_to_bytes;
function bytes_to_codepoint(in_bytes in varchar2) return varchar2 deterministic is
co_bytes constant varchar2(8 char) := in_bytes;
l_len integer;
l_bits varchar2(21 char);
l_return varchar2(8 char);
begin
-- algorithm based on description in https://en.wikipedia.org/wiki/UTF-8#Description
l_len := length(co_bytes);
case l_len
when 2 then -- one byte
l_return := 'U+00' || build_byte('0', bits_from_byte(co_bytes, 2));
when 4 then -- two bytes
l_bits := bits_from_byte(substr(co_bytes, 1, 2), 4)
|| bits_from_byte(substr(co_bytes, 3, 2), 3);
l_return := 'U+' || build_byte('00000', substr(l_bits, 1, 3))
|| build_byte(null, substr(l_bits, 4));
when 6 then -- three bytes
l_bits := bits_from_byte(substr(co_bytes, 1, 2), 5)
|| bits_from_byte(substr(co_bytes, 3, 2), 3)
|| bits_from_byte(substr(co_bytes, 5, 2), 3);
l_return := 'U+' || build_byte(null, substr(l_bits, 1, 8))
|| build_byte(null, substr(l_bits, 9));
when 8 then -- four bytes
l_bits := bits_from_byte(substr(co_bytes, 1, 2), 6)
|| bits_from_byte(substr(co_bytes, 3, 2), 3)
|| bits_from_byte(substr(co_bytes, 5, 2), 3)
|| bits_from_byte(substr(co_bytes, 7, 2), 3);
l_return := 'U+' || build_byte('000', substr(l_bits, 1, 5))
|| build_byte(null, substr(l_bits, 6, 8))
|| build_byte(null, substr(l_bits, 14));
end case;
return l_return;
end bytes_to_codepoint;
function codepoint_to_unistr(in_codepoint in varchar2) return varchar2 deterministic is
co_codepoint constant varchar2(8 char) := in_codepoint;
l_cp integer;
l_binary varchar2(20 char);
l_return varchar2(10 char);
begin
-- algorithm based on description in https://en.wikipedia.org/wiki/UTF-16
if upper(co_codepoint) like 'U+%' then
l_cp := hex_to_num(substr(co_codepoint, 3));
else
l_cp := hex_to_num(co_codepoint);
end if;
if l_cp <= hex_to_num('FFFF') then -- basic multilingual plane (BMP)
l_return := '\' || lpad(to_char(l_cp, 'FMXXXX'), 4, '0');
elsif l_cp <= hex_to_num('10FFFF') then -- other plane -> high/low surrogates
l_binary := int_to_binary(l_cp - hex_to_num('10000'), 20);
l_return := '\' || build_byte('110110', substr(l_binary, 1, 2))
|| build_byte(null, substr(l_binary, 3, 8))
|| '\' || build_byte('110111', substr(l_binary, 11, 2))
|| build_byte(null, substr(l_binary, 13, 8));
end if;
return l_return;
end codepoint_to_unistr;
function unistr_to_codepoint(in_unistr in varchar2) return varchar2 deterministic is
co_unistr constant varchar2(10 char) := in_unistr;
l_len integer;
l_bits varchar2(20 char);
l_cp integer;
l_return varchar2(8 char);
begin
-- algorithm based on description in https://en.wikipedia.org/wiki/UTF-16
l_len := length(co_unistr);
case l_len
when 5 then -- basic multilingual plane (BMP)
l_return := 'U+' || upper(substr(co_unistr, 2));
when 10 then -- other plane -> high/low surrogates
l_bits := bits_from_byte(substr(co_unistr, 2, 2), 7)
|| bits_from_byte(substr(co_unistr, 4, 2), 1)
|| bits_from_byte(substr(co_unistr, 7, 2), 7)
|| bits_from_byte(substr(co_unistr, 9, 2), 1);
l_cp := hex_to_num('10000') +
hex_to_num(
build_byte('0000', substr(l_bits, 1, 4))
|| build_byte(null, substr(l_bits, 5, 8))
|| build_byte(null, substr(l_bits, 13, 8))
);
l_return := 'U+' || trim(leading '0' from lpad(to_char(l_cp, 'FMXXXXXX'), 6, '0'));
end case;
return l_return;
end unistr_to_codepoint;
end utf8;
/
-- demo utf8.codepoint_to_bytes
column result_41 format a9
column result_c2a3 format a11
column result_e282ac format a13
column result_e280bc format a13
column result_f09f9880 format a15
select utf8.codepoint_to_bytes('U+0041') as result_41, -- A
utf8.codepoint_to_bytes('U+00A3') as result_c2a3, -- £
utf8.codepoint_to_bytes('U+20AC') as result_e282ac, -- €
utf8.codepoint_to_bytes('U+203C') as result_e280bc, -- ‼
utf8.codepoint_to_bytes('U+1F600') as result_f09f9880 -- 😀
from dual;
-- demo utf8.bytes_to_codepoint
column result_0041 format a11
column result_00A3 format a11
column result_20ac format a11
column result_203c format a11
column result_1f600 format a12
select utf8.bytes_to_codepoint('41') as result_0041, -- A
utf8.bytes_to_codepoint('c2a3') as result_00A3, -- £
utf8.bytes_to_codepoint('e282ac') as result_20AC, -- €
utf8.bytes_to_codepoint('e280bc') as result_203C, -- ‼
utf8.bytes_to_codepoint('f09f9880') as result_1F600 -- 😀
from dual;
-- demo utf8.codepoint_to_unistr (result to be used in unistr function)
column result_0041 format a11
column result_00A3 format a11
column result_20ac format a11
column result_203c format a11
column result_d83d_de00 format a16
select utf8.codepoint_to_unistr('U+0041') as result_0041, -- A
utf8.codepoint_to_unistr('U+00A3') as result_00A3, -- £
utf8.codepoint_to_unistr('U+20AC') as result_20AC, -- €
utf8.codepoint_to_unistr('U+203C') as result_203C, -- ‼
utf8.codepoint_to_unistr('U+1F600') as result_D83D_DE00 -- 😀
from dual;
-- demo utf8.unistr_to_codepoint (input used in unistr function)
column result_0041 format a11
column result_00A3 format a11
column result_20ac format a11
column result_203c format a11
column result_1f600 format a12
select utf8.unistr_to_codepoint('\0041') as result_0041, -- A
utf8.unistr_to_codepoint('\00A3') as result_00A3, -- £
utf8.unistr_to_codepoint('\20AC') as result_20AC, -- €
utf8.unistr_to_codepoint('\203C') as result_203C, -- ‼
utf8.unistr_to_codepoint('\D83D\DE00') as result_1F600 -- 😀
from dual;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment