Last active
November 21, 2021 07:34
-
-
Save zacque0/d8c2dfdd3a16836589380fcf0ac3aeb9 to your computer and use it in GitHub Desktop.
Standard ML Simple Unicode Codepoint -> UTF-8 Encoder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* Algorithm based on http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding-Algorithm.html *) | |
(* Raised by encodeUTF8 when its argument cannot be encoded as UTF-8. *)
exception EncodeError

(* Encode a single Unicode code point as a list of UTF-8 bytes.
 *
 * codepoint : the code point to encode, as an int.
 * Returns   : 1 to 4 bytes, most significant first, each held in a
 *             Word.word whose value is in the range 0wx00 .. 0wxFF.
 * Raises    : EncodeError if codepoint is negative, lies in the surrogate
 *             range 0xD800 .. 0xDFFF (which UTF-8 must not encode), or is
 *             greater than 0x10FFFF.
 *)
fun encodeUTF8 (codepoint : int) : Word.word list =
    let val andb = Word.andb
        val orb = Word.orb
        val >> = Word.>>
        val fromInt = Word.fromInt
        (* All three operators share one precedence level and associate to
           the left, so `a andb m orb p` reads as `(a andb m) orb p`. *)
        infix andb orb >>
        val cpb = fromInt codepoint
        (* Shift amounts; Word.>> takes its shift count as a Word.word. *)
        val b6 = fromInt 6
        val b12 = fromInt 12
        val b18 = fromInt 18
        (* Payload masks and prefix bits, as Word.word values. *)
        val b3F = fromInt 0x3F   (* low 6 bits: continuation payload *)
        val b80 = fromInt 0x80   (* 10xxxxxx: continuation prefix *)
        val b1F = fromInt 0x1F   (* low 5 bits: 2-byte lead payload *)
        val bC0 = fromInt 0xC0   (* 110xxxxx: 2-byte lead prefix *)
        val b0F = fromInt 0x0F   (* low 4 bits: 3-byte lead payload *)
        val bE0 = fromInt 0xE0   (* 1110xxxx: 3-byte lead prefix *)
        val b07 = fromInt 0x07   (* low 3 bits: 4-byte lead payload *)
        val bF0 = fromInt 0xF0   (* 11110xxx: 4-byte lead prefix *)
        (* Build a continuation byte from the low six bits of w. *)
        fun cont w = (w andb b3F) orb b80
    in (* Reject invalid input up front.  Without the negative check a
          negative int would pass `codepoint < 0x80` and be emitted as a
          single meaningless byte. *)
       if codepoint < 0
          orelse (0xD800 <= codepoint andalso codepoint <= 0xDFFF)
       then raise EncodeError
       else if codepoint < 0x80
       then [cpb]
       else if codepoint < 0x0800
       then [((cpb >> b6) andb b1F) orb bC0,
             cont cpb]
       else if codepoint < 0x010000
       then [((cpb >> b12) andb b0F) orb bE0,
             cont (cpb >> b6),
             cont cpb]
       else if codepoint < 0x110000
       then [((cpb >> b18) andb b07) orb bF0,
             cont (cpb >> b12),
             cont (cpb >> b6),
             cont cpb]
       else raise EncodeError
    end
(* Manually tested against these | |
- Test input -> output. | |
1. 0x0 -> 0x0 | |
2. 0x50 -> 0x50 | |
3. 0x7F -> 0x7F | |
- one byte | |
4. 0x80 -> 0xC2 80 | |
5. 0x506 -> 0xD4 86 | |
6. 0x7FF -> 0xDF BF | |
- two bytes | |
7. 0x800 -> 0xE0 A0 80 | |
8. 0x5555 -> 0xE5 95 95 | |
9. 0x0FFFF -> 0xEF BF BF | |
- three bytes | |
10. 0x010000 -> 0xF0 90 80 80 | |
11. 0x0FFFFF -> 0xF3 BF BF BF | |
12. 0x10FFFF -> 0xF4 8F BF BF | |
- four bytes | |
13. 0x110000 | |
- error! | |
- Test against: https://unicode.scarfboy.com/ | |
- There is an encoding row for "URL-encoded UTF8". | |
*) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* Encode a list of code points as one flat list of UTF-8 bytes.
 *
 * codepoints : the code points to encode, in order.
 * Returns    : the concatenation, in the same order, of encodeUTF8 applied
 *              to each code point; [] for an empty input list.
 * Raises     : whatever encodeUTF8 raises for the first code point (from
 *              the left) that it rejects.
 *)
fun encodeUTF8s (codepoints : int list) : Word.word list =
    (* List.map visits the elements left to right and List.concat joins the
       per-code-point byte lists in a single pass.  This avoids repeatedly
       appending to the end of a growing accumulator with @, which recopies
       the accumulated bytes on every step. *)
    List.concat (List.map encodeUTF8 codepoints)
(* Simple test in REPL | |
> encodeUTF8s [0x50, 0x506, 0x5555, 0xFFFFF]; | |
val it = | |
[0wx50, 0wxD4, 0wx86, 0wxE5, 0wx95, 0wx95, 0wxF3, 0wxBF, 0wxBF, 0wxBF]: | |
word list | |
*) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment