Created
May 21, 2019 17:29
-
-
Save paniq/a27d5831231857be75e4f3b4f64a54d2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using import enum

# Result of a single decoding step over a UTF-8 byte stream.
enum UTF8
    # an invalid state was encountered; the payload is the offending byte
    Invalid : i8
    # a codepoint has been read successfully; the payload is its value
    Codepoint : u32
# Binding for the LLVM count-leading-zeros intrinsic on 8-bit integers.
# The corresponding LLVM declaration is:
# declare i8 @llvm.ctlz.i8 (i8 <src>, i1 <is_zero_undef>)
let llvm.ctlz.i8 =
    extern 'llvm.ctlz.i8
        function i8 i8 bool
# Count the leading zero bits of the 8-bit value `c`.
# `false` is passed for the intrinsic's is_zero_undef flag, so a zero input
# is requested to yield a defined result rather than an undefined one.
inline ctlz (c)
    llvm.ctlz.i8 c false
inline decoder (coll)
    """"Decode an i8 character stream encoded as UTF-8 into UTF8 enum values.
        Every completed sequence is pushed to `coll` as `UTF8.Codepoint`, and
        every byte that does not fit the current state is pushed as
        `UTF8.Invalid`, after which decoding restarts at a fresh codepoint.
        When `coll` is `none`, the collector-wrapping function is returned
        instead so that it can be applied to a collector later.
        Overlong encodings, surrogates and values above U+10FFFF are not
        checked for.
    inline _decoder (coll)
        # unpack the four functions of the downstream collector
        let init full? done push = ((coll as Collector))
        # build a collector whose state is the downstream state prefixed with
        # two values of our own: `b` (expected byte) and `cp` (partial codepoint)
        Collector
            inline ()
                # which byte we expect and the codepoint we are building
                _ 0:i8 0:u32 (init)
            inline (b cp state...)
                # forwarded to the downstream collector, ignoring our own state
                full? state...
            inline (b cp state...)
                # forwarded to the downstream collector, ignoring our own state
                done state...
            inline (src b cp state...)
                # fetch the current input byte
                let c = (imply (src) i8)
                # full state: expected byte (bits 4-5) and leading bits (bits 0-3)
                # the leading bits are the number of leading one bits of `c`,
                # obtained by counting the leading zeros of its complement
                let st = (b | (ctlz (~ c)))
                switch st
                # expecting new codepoint, 1 byte header
                case 0b000000:i8
                    # 7 bits, reset
                    # the payload is converted to u32 like in every other branch
                    return 0b000000:i8 0:u32
                        push (inline () (UTF8.Codepoint (c as u32))) state...
                # expecting new codepoint, 2 byte header
                case 0b000010:i8
                    # 11 bits; start with bits 6-10, expect byte 1
                    return 0b010000:i8 ((c & 0b11111:i8) as u32) state...
                # expecting new codepoint, 3 byte header
                case 0b000011:i8
                    # 16 bits; start with bits 12-15, expect byte 2
                    return 0b100000:i8 ((c & 0b1111:i8) as u32) state...
                # expecting new codepoint, 4 byte header
                case 0b000100:i8
                    # 21 bits; start with bits 18-20, expect byte 3
                    return 0b110000:i8 ((c & 0b111:i8) as u32) state...
                # expecting byte 3, cont header
                # expecting byte 2, cont header
                pass 0b110001:i8
                case 0b100001:i8
                    # read 6 bits, count down by 1
                    return (b - 0b10000:i8)
                        (cp << 6:u32) | ((c & 0b111111:i8) as u32)
                        state...
                # expecting byte 1, cont header
                case 0b010001:i8
                    # read 6 bits, complete & reset
                    let cp = ((cp << 6:u32) | ((c & 0b111111:i8) as u32))
                    return 0b000000:i8 0:u32
                        push (inline () (UTF8.Codepoint cp)) state...
                # illegal
                default
                    # reset
                    return 0b000000:i8 0:u32
                        push (inline () (UTF8.Invalid c)) state...
    # without a collector, hand back the wrapping function itself
    static-if (none? coll) _decoder
    else (_decoder coll)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment