zacque0/encoder.sml

## encoder.sml
(* Algorithm based on http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding-Algorithm.html *)
exception EncodeError

(* Encode a single Unicode code point as UTF-8.

   Returns the encoded bytes in order, one byte value (0wx00-0wxFF) per
   Word.word element:
     0x0000   .. 0x007F    -> 1 byte   0xxxxxxx
     0x0080   .. 0x07FF    -> 2 bytes  110xxxxx 10xxxxxx
     0x0800   .. 0xFFFF    -> 3 bytes  1110xxxx 10xxxxxx 10xxxxxx
     0x10000  .. 0x10FFFF  -> 4 bytes  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

   Raises EncodeError when codepoint cannot be encoded: it is negative,
   greater than 0x10FFFF, or a surrogate (0xD800-0xDFFF). *)
fun encodeUTF8 (codepoint : int) : Word.word list =
    let val andb = Word.andb
        val orb = Word.orb
        val >> = Word.>>
        infix andb orb >>

        (* The code point as a Word.word so the bit operations apply. *)
        val cpb = Word.fromInt codepoint

        (* Leading byte: cpb shifted right by `shift`, limited to the payload
           bits in `mask`, then tagged with the length marker `tag`. *)
        fun lead (shift, mask, tag) = ((cpb >> shift) andb mask) orb tag

        (* Continuation byte 10xxxxxx: six bits of cpb starting at bit `shift`. *)
        fun cont shift = ((cpb >> shift) andb 0wx3F) orb 0wx80
    in if codepoint < 0 orelse codepoint > 0x10FFFF
       then raise EncodeError
       else if 0xD800 <= codepoint andalso codepoint <= 0xDFFF
       then raise EncodeError (* UTF-8 forbids encoding surrogates *)
       else if codepoint < 0x80
       then [cpb]
       else if codepoint < 0x0800
       then [lead (0w6, 0wx1F, 0wxC0), cont 0w0]
       else if codepoint < 0x010000
       then [lead (0w12, 0wx0F, 0wxE0), cont 0w6, cont 0w0]
       else [lead (0w18, 0wx07, 0wxF0), cont 0w12, cont 0w6, cont 0w0]
    end

(* Manually tested against these
- Test input -> output.
  1. 0x0 -> 0x0
  2. 0x50 -> 0x50
  3. 0x7F -> 0x7F
     - one byte
  4. 0x80 -> 0xC2 80
  5. 0x506 -> 0xD4 86
  6. 0x7FF -> 0xDF BF
     - two bytes
  7. 0x800 -> 0xE0 A0 80
  8. 0x5555 -> 0xE5 95 95
  9. 0x0FFFF -> 0xEF BF BF
     - three bytes
  10. 0x010000 -> 0xF0 90 80 80
  11. 0x0FFFFF -> 0xF3 BF BF BF
  12. 0x10FFFF -> 0xF4 8F BF BF
      - four bytes
  13. 0x110000
      - error!

- Test against: https://unicode.scarfboy.com/
  - There is an encoding row for "URL-encoded UTF8".

*)

## multicodepoints.sml
(* Encode a list of code points as UTF-8.

   Returns the bytes of every code point concatenated in input order, one
   byte value per Word.word element. An empty input yields an empty list.
   Raises EncodeError if encodeUTF8 rejects any of the code points. *)
fun encodeUTF8s (codepoints : int list) : Word.word list =
    (* Encode each code point once and concatenate the results; this avoids
       repeatedly appending to a growing accumulator with `@`, whose total
       cost is quadratic in the output length. *)
    List.concat (List.map encodeUTF8 codepoints)

(* Simple test in REPL

> encodeUTF8s [0x50, 0x506, 0x5555, 0xFFFFF];
val it =
   [0wx50, 0wxD4, 0wx86, 0wxE5, 0wx95, 0wx95, 0wxF3, 0wxBF, 0wxBF, 0wxBF]:
   word list

*)
	(* Algorithm based on http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding-Algorithm.html *)
	exception EncodeError

	(* Encode a single Unicode code point as UTF-8.

	   Returns the encoded bytes in order, one byte value (0wx00-0wxFF) per
	   Word.word element:
	     0x0000   .. 0x007F    -> 1 byte   0xxxxxxx
	     0x0080   .. 0x07FF    -> 2 bytes  110xxxxx 10xxxxxx
	     0x0800   .. 0xFFFF    -> 3 bytes  1110xxxx 10xxxxxx 10xxxxxx
	     0x10000  .. 0x10FFFF  -> 4 bytes  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

	   Raises EncodeError when codepoint cannot be encoded: it is negative,
	   greater than 0x10FFFF, or a surrogate (0xD800-0xDFFF). *)
	fun encodeUTF8 (codepoint : int) : Word.word list =
	    let val andb = Word.andb
	        val orb = Word.orb
	        val >> = Word.>>
	        infix andb orb >>

	        (* The code point as a Word.word so the bit operations apply. *)
	        val cpb = Word.fromInt codepoint

	        (* Leading byte: cpb shifted right by `shift`, limited to the payload
	           bits in `mask`, then tagged with the length marker `tag`. *)
	        fun lead (shift, mask, tag) = ((cpb >> shift) andb mask) orb tag

	        (* Continuation byte 10xxxxxx: six bits of cpb starting at bit `shift`. *)
	        fun cont shift = ((cpb >> shift) andb 0wx3F) orb 0wx80
	    in if codepoint < 0 orelse codepoint > 0x10FFFF
	       then raise EncodeError
	       else if 0xD800 <= codepoint andalso codepoint <= 0xDFFF
	       then raise EncodeError (* UTF-8 forbids encoding surrogates *)
	       else if codepoint < 0x80
	       then [cpb]
	       else if codepoint < 0x0800
	       then [lead (0w6, 0wx1F, 0wxC0), cont 0w0]
	       else if codepoint < 0x010000
	       then [lead (0w12, 0wx0F, 0wxE0), cont 0w6, cont 0w0]
	       else [lead (0w18, 0wx07, 0wxF0), cont 0w12, cont 0w6, cont 0w0]
	    end

	(* Manually tested against these
	- Test input -> output.
	1. 0x0 -> 0x0
	2. 0x50 -> 0x50
	3. 0x7F -> 0x7F
	- one byte
	4. 0x80 -> 0xC2 80
	5. 0x506 -> 0xD4 86
	6. 0x7FF -> 0xDF BF
	- two bytes
	7. 0x800 -> 0xE0 A0 80
	8. 0x5555 -> 0xE5 95 95
	9. 0x0FFFF -> 0xEF BF BF
	- three bytes
	10. 0x010000 -> 0xF0 90 80 80
	11. 0x0FFFFF -> 0xF3 BF BF BF
	12. 0x10FFFF -> 0xF4 8F BF BF
	- four bytes
	13. 0x110000
	- error!

	- Test against: https://unicode.scarfboy.com/
	- There is an encoding row for "URL-encoded UTF8".

	*)
	(* Encode a list of code points as UTF-8.

	   Returns the bytes of every code point concatenated in input order, one
	   byte value per Word.word element. An empty input yields an empty list.
	   Raises EncodeError if encodeUTF8 rejects any of the code points. *)
	fun encodeUTF8s (codepoints : int list) : Word.word list =
	    (* Encode each code point once and concatenate the results; this avoids
	       repeatedly appending to a growing accumulator with `@`, whose total
	       cost is quadratic in the output length. *)
	    List.concat (List.map encodeUTF8 codepoints)

	(* Simple test in REPL

	> encodeUTF8s [0x50, 0x506, 0x5555, 0xFFFFF];
	val it =
	[0wx50, 0wxD4, 0wx86, 0wxE5, 0wx95, 0wx95, 0wxF3, 0wxBF, 0wxBF, 0wxBF]:
	word list

	*)