Skip to content

Instantly share code, notes, and snippets.

@poizan42
Last active September 28, 2018 13:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save poizan42/6e3703cf268108a0c88d to your computer and use it in GitHub Desktop.
Save poizan42/6e3703cf268108a0c88d to your computer and use it in GitHub Desktop.
UTF-8 decoder in Standard ML
(* Raised by explodeUtf8 when the input is not well-formed UTF-8.
 * The payload is a human-readable description of the problem. *)
exception Encoding of string;
local
  (* decodeUtf8Chars (bytes, acc)
   * Decodes the remaining bytes, one char per byte, and returns all
   * code points in input order. acc holds the code points decoded so far
   * in reverse order; using an accumulator keeps the recursion in tail
   * position so long inputs do not grow the stack.
   * Raises Encoding on an invalid start byte. *)
  fun decodeUtf8Chars (nil, acc) = rev acc
    | decodeUtf8Chars (c::rest, acc) =
        let
          val cn = ord c
        in
          (* Single-byte (ASCII) code point. *)
          if cn < 0x80 then decodeUtf8Chars (rest, cn::acc)
          (* 0xF4 is the largest allowed start byte after the restriction
           * to codepoints <= 0x10FFFF by RFC 3629 *)
          else if cn > 0xF4 then
            raise Encoding ("Start byte "^Int.toString(cn)^" too large.")
          (* For each sequence length pass the number of continuation bytes
           * still expected, the payload bits of the start byte, and the
           * smallest code point that legitimately needs that length. *)
          else if cn >= 0xF0 then
            decodeUtf8Seq (rest, 3, cn-0xF0, 0x10000, acc)
          else if cn >= 0xE0 then
            decodeUtf8Seq (rest, 2, cn-0xE0, 0x800, acc)
          else if cn >= 0xC0 then
            decodeUtf8Seq (rest, 1, cn-0xC0, 0x80, acc)
          (* 0x80..0xBF: a continuation byte where a start byte is needed. *)
          else raise Encoding ("Start byte "^Int.toString(cn)^" too small.")
        end
  (* decodeUtf8Seq (bytes, lenLeft, value, minValue, acc)
   * Consumes lenLeft continuation bytes, shifting their six payload bits
   * into value. Once the sequence is complete the code point is validated
   * and decoding continues with decodeUtf8Chars.
   * Raises Encoding if the input ends early, a byte is not a continuation
   * byte (0x80..0xBF), or the decoded code point is above 0x10FFFF,
   * below minValue (an overlong encoding) or a surrogate. *)
  and decodeUtf8Seq (rest, 0, value, minValue, acc) =
        if value > 0x10FFFF then
          raise Encoding ("Sequence "^Int.toString(value)^" is too large")
        (* A value that fits in a shorter sequence is an overlong encoding,
         * which RFC 3629 requires decoders to reject. *)
        else if value < minValue then
          raise Encoding ("Overlong encoding of "^Int.toString(value))
        (* U+D800..U+DFFF are reserved for UTF-16 surrogates and are not
         * valid in UTF-8. *)
        else if value >= 0xD800 andalso value <= 0xDFFF then
          raise Encoding ("Surrogate "^Int.toString(value)^" is not allowed")
        else decodeUtf8Chars (rest, value::acc)
    | decodeUtf8Seq (nil, _, _, _, _) =
        raise Encoding "Unexpected end of string"
    | decodeUtf8Seq (c::rest, lenLeft, value, minValue, acc) =
        let
          val cn = ord c
        in
          if cn < 0x80 orelse cn >= 0xC0 then
            raise Encoding ("Invalid continuation byte: "^Int.toString(cn))
          else
            decodeUtf8Seq
              (rest, lenLeft-1, value*64 + (cn-0x80), minValue, acc)
        end
in
  (* explodeUtf8: string -> int list
   * Decodes an UTF-8 encoded string into a list of unicode codepoints.
   * Raises Encoding if the string is not well-formed UTF-8: bad start or
   * continuation byte, truncated sequence, overlong encoding, surrogate,
   * or a code point above 0x10FFFF. *)
  fun explodeUtf8 s = decodeUtf8Chars (explode s, nil)
end;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment