Skip to content

Instantly share code, notes, and snippets.

@zacque0
Last active November 21, 2021 07:34
Show Gist options
  • Save zacque0/d8c2dfdd3a16836589380fcf0ac3aeb9 to your computer and use it in GitHub Desktop.
Save zacque0/d8c2dfdd3a16836589380fcf0ac3aeb9 to your computer and use it in GitHub Desktop.
Standard ML Simple Unicode Codepoint -> UTF-8 Encoder
(* Algorithm based on http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding-Algorithm.html *)
(* Raised by encodeUTF8 when the given code point cannot be encoded as UTF-8. *)
exception EncodeError

(* Encode a single Unicode code point as UTF-8.

   Returns the encoded bytes in order, leading byte first, as a list of one
   to four Word.word values, each in the range 0wx00-0wxFF.

   Raises EncodeError when codepoint is not a Unicode scalar value, i.e. when
   it is negative, lies in the surrogate range 0xD800-0xDFFF (which UTF-8 is
   not allowed to encode), or is greater than 0x10FFFF. *)
fun encodeUTF8 (codepoint : int) : Word.word list =
    let val andb = Word.andb
        val orb = Word.orb
        val >> = Word.>>
        (* All three operators get the default precedence 0 and associate to
           the left, so "a andb b orb c" groups as "(a andb b) orb c". *)
        infix andb orb >>
        val cpb = Word.fromInt codepoint
        (* Leading byte: the bits of the code point above `shift`, restricted
           by `mask` to the payload bits, combined with the length marker. *)
        fun lead (shift, mask, marker) = ((cpb >> shift) andb mask) orb marker
        (* Continuation byte 10xxxxxx carrying the six bits starting at `shift`. *)
        fun cont shift = ((cpb >> shift) andb 0wx3F) orb 0wx80
        val isSurrogate = 0xD800 <= codepoint andalso codepoint <= 0xDFFF
    in (* Reject invalid input up front; a negative value would otherwise
          satisfy "codepoint < 0x80" and be emitted as a bogus single byte. *)
       if codepoint < 0 orelse isSurrogate orelse codepoint > 0x10FFFF
       then raise EncodeError
       else if codepoint < 0x80
       then [cpb]                                            (* 0xxxxxxx *)
       else if codepoint < 0x800
       then [lead (0w6, 0wx1F, 0wxC0), cont 0w0]             (* 110xxxxx 10xxxxxx *)
       else if codepoint < 0x10000
       then [lead (0w12, 0wx0F, 0wxE0), cont 0w6, cont 0w0]  (* 1110xxxx + 2 continuation bytes *)
       else [lead (0w18, 0wx07, 0wxF0), cont 0w12, cont 0w6, cont 0w0]
                                                             (* 11110xxx + 3 continuation bytes *)
    end
(* Manually tested against these
- Test input -> output.
1. 0x0 -> 0x0
2. 0x50 -> 0x50
3. 0x7F -> 0x7F
- one byte
4. 0x80 -> 0xC2 80
5. 0x506 -> 0xD4 86
6. 0x7FF -> 0xDF BF
- two bytes
7. 0x800 -> 0xE0 A0 80
8. 0x5555 -> 0xE5 95 95
9. 0x0FFFF -> 0xEF BF BF
- three bytes
10. 0x010000 -> 0xF0 90 80 80
11. 0x0FFFFF -> 0xF3 BF BF BF
12. 0x10FFFF -> 0xF4 8F BF BF
- four bytes
13. 0x110000
- error!
- Test against: https://unicode.scarfboy.com/
- There is an encoding row for "URL-encoded UTF8".
*)
(* Convert a list of code points into a list of UTF-8 bytes.

   The bytes of each code point appear in input order, concatenated into a
   single list; an empty input yields an empty list. Code points are encoded
   from left to right, and any exception raised by encodeUTF8 for an element
   propagates to the caller. *)
fun encodeUTF8s (codepoints : int list) : Word.word list =
    (* Encode every code point once, then join the pieces in a single pass.
       Appending each result to the end of a growing accumulator would
       re-copy the accumulator on every step. *)
    List.concat (List.map encodeUTF8 codepoints)
(* Simple test in REPL
> encodeUTF8s [0x50, 0x506, 0x5555, 0xFFFFF];
val it =
[0wx50, 0wxD4, 0wx86, 0wxE5, 0wx95, 0wx95, 0wxF3, 0wxBF, 0wxBF, 0wxBF]:
word list
*)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment