Skip to content

Instantly share code, notes, and snippets.

@g-andrade
Last active April 9, 2022 19:28
Show Gist options
  • Save g-andrade/7b0e267dd2ff83c3b2ec to your computer and use it in GitHub Desktop.
Save g-andrade/7b0e267dd2ff83c3b2ec to your computer and use it in GitHub Desktop.
Decoding CESU-8 in Erlang (into UTF-8).
% Entry point: decode a CESU-8 encoded binary.
%
% Starts the worker decode_cesu8/2 with an empty accumulator and returns
% whatever that worker returns.
decode_cesu8(Cesu8) ->
    decode_cesu8(Cesu8, []).
% https://en.wikipedia.org/wiki/UTF-16
%
% Merge UTF-16 surrogate pairs found in a list of code units.
%
% The list is walked front to back and every result is prepended to the
% accumulator, so the returned list is in the opposite order to the input.
% A pair is recognised when an element in 16#DC00..16#DFFF (trail/low
% surrogate) is immediately followed by an element in 16#D800..16#DBFF
% (lead/high surrogate); the two are replaced by the single code point
% they encode. Every other element is copied through unchanged, including
% surrogates that do not form such a pair.
decode_surrogate_pairs([Trail, Lead | Remaining], Decoded)
  when Lead >= 16#D800, Lead =< 16#DBFF, Trail >= 16#DC00, Trail =< 16#DFFF ->
    % 10 payload bits from each half, offset past the 16-bit range.
    CodePoint = 16#10000 + (((Lead - 16#D800) bsl 10) bor (Trail - 16#DC00)),
    decode_surrogate_pairs(Remaining, [CodePoint | Decoded]);
decode_surrogate_pairs([Unit | Remaining], Decoded) ->
    decode_surrogate_pairs(Remaining, [Unit | Decoded]);
decode_surrogate_pairs([], Decoded) ->
    Decoded.
% http://www.unicode.org/reports/tr26/
%
% Worker for decode_cesu8/1.
%
% Consumes the binary one encoded unit at a time (1-, 2- or 3-byte forms)
% and pushes each decoded integer onto the front of Units, so Units holds
% the decoded values in reverse input order. Once the binary is exhausted,
% decode_surrogate_pairs/2 merges surrogate pairs (reversing the list back
% in the process) and the resulting list of integers is handed to
% unicode:characters_to_binary/2, whose result is returned as-is.
%
% Input that matches none of the three byte layouts is not handled here;
% the call fails with a function_clause error.
decode_cesu8(<<0:1, Unit:7, Tail/binary>>, Units) ->
    % Single byte: 0xxxxxxx
    decode_cesu8(Tail, [Unit | Units]);
decode_cesu8(<<2#110:3, Hi:5, 2#10:2, Lo:6, Tail/binary>>, Units) ->
    % Two bytes: 110yyyyy 10xxxxxx
    Unit = (Hi bsl 6) bor Lo,
    decode_cesu8(Tail, [Unit | Units]);
decode_cesu8(<<2#1110:4, Hi:4, 2#10:2, Mid:6, 2#10:2, Lo:6, Tail/binary>>, Units) ->
    % Three bytes: 1110zzzz 10yyyyyy 10xxxxxx
    Unit = (Hi bsl 12) bor (Mid bsl 6) bor Lo,
    decode_cesu8(Tail, [Unit | Units]);
decode_cesu8(<<>>, Units) ->
    CodePoints = decode_surrogate_pairs(Units, []),
    unicode:characters_to_binary(CodePoints, utf16).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment