Last active
September 28, 2018 13:49
-
-
Save poizan42/6e3703cf268108a0c88d to your computer and use it in GitHub Desktop.
UTF-8 decoder in Standard ML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* Raised by explodeUtf8 when its input is not well-formed UTF-8.
 * The payload is a human-readable description of the problem. *)
exception Encoding of string;

local
  (* minValue len
   * Smallest codepoint that legitimately requires a sequence of `len`
   * bytes in total. A decoded value below this bound means the sequence
   * is an overlong encoding, which RFC 3629 requires decoders to reject
   * (e.g. 0xC0 0x80 must not decode to 0). *)
  fun minValue 2 = 0x80
    | minValue 3 = 0x800
    | minValue _ = 0x10000

  (* decodeUtf8Chars chars
   * Decodes a list of bytes (as chars) into a list of codepoints.
   * ASCII bytes map to themselves; any other byte must be the start byte
   * of a multi-byte sequence, which is handed to decodeUtf8Seq. *)
  fun decodeUtf8Chars nil = nil
    | decodeUtf8Chars (c::rest) =
        if c < #"\128" then (ord c)::(decodeUtf8Chars rest)
        else
          let
            val cn = ord c
            (* 0xF4 is the largest allowed start byte after the restriction
             * to codepoints <= 0x10FFFF by RFC 3629.
             * Bytes 0x80..0xBF are continuation bytes and cannot start a
             * sequence, hence "too small". *)
            val (len, value) =
              if cn > 0xF4 then
                raise Encoding ("Start byte "^Int.toString(cn)^" too large.")
              else if cn >= 0xF0 then (4, cn-0xF0)
              else if cn >= 0xE0 then (3, cn-0xE0)
              else if cn >= 0xC0 then (2, cn-0xC0)
              else raise Encoding ("Start byte "^Int.toString(cn)^" too small.")
          in
            decodeUtf8Seq (rest, len-1, value, minValue len)
          end

  (* decodeUtf8Seq (chars, lenLeft, value, least)
   * Consumes `lenLeft` continuation bytes from `chars`, folding 6 payload
   * bits per byte into `value`. Once the sequence is complete the
   * codepoint is validated (range, overlong form, surrogates) and decoding
   * resumes with decodeUtf8Chars on the remaining input.
   * `least` is the minimum codepoint allowed for this sequence length. *)
  and decodeUtf8Seq (rest, 0, value, least) =
        if value > 0x10FFFF then
          raise Encoding ("Sequence "^Int.toString(value)^" is too large")
        else if value < least then
          raise Encoding ("Overlong encoding of "^Int.toString(value))
        else if value >= 0xD800 andalso value <= 0xDFFF then
          (* UTF-16 surrogates are not valid scalar values in UTF-8. *)
          raise Encoding ("Surrogate "^Int.toString(value)^" is not allowed")
        else value::(decodeUtf8Chars rest)
    | decodeUtf8Seq (nil, _, _, _) =
        raise Encoding "Unexpected end of string"
    | decodeUtf8Seq (c::rest, lenLeft, value, least) =
        let
          val cn = ord c
          (* A continuation byte has the bit pattern 10xxxxxx. *)
          val partValue =
            if cn < 0x80 orelse cn >= 0xC0 then
              raise Encoding ("Invalid continuation byte: "^Int.toString(cn))
            else cn-0x80
        in
          decodeUtf8Seq (rest, lenLeft-1, value*64 + partValue, least)
        end
in
  (* explodeUtf8: string -> int list
   * Decodes an UTF-8 encoded string into a list of Unicode codepoints.
   * Raises Encoding if the string is not well-formed UTF-8: bad start or
   * continuation byte, truncated sequence, overlong encoding, surrogate,
   * or codepoint above 0x10FFFF. *)
  val explodeUtf8 = decodeUtf8Chars o explode
end;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment