Skip to content

Instantly share code, notes, and snippets.

@paniq
Created May 21, 2019 17:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save paniq/a27d5831231857be75e4f3b4f64a54d2 to your computer and use it in GitHub Desktop.
Save paniq/a27d5831231857be75e4f3b4f64a54d2 to your computer and use it in GitHub Desktop.
using import enum
# Result of one decoding step, as emitted by the decoder defined in this file.
enum UTF8
    # an invalid state was encountered
    # (the decoder stores the byte that triggered the invalid state here)
    Invalid : i8
    # a codepoint has been read successfully
    # (the payload is the assembled codepoint value)
    Codepoint : u32
# Binding for LLVM's 8-bit count-leading-zeros intrinsic.
# declare i8 @llvm.ctlz.i8 (i8 <src>, i1 <is_zero_undef>)
# The function type below mirrors that declaration: it returns i8 and takes
# the i8 source value plus the bool `is_zero_undef` flag.
let llvm.ctlz.i8 =
    extern 'llvm.ctlz.i8
        function i8 i8 bool
inline ctlz (value)
    # count the leading zero bits of an i8 value via the LLVM intrinsic;
    # `false` is passed for is_zero_undef, so a zero input is not left undefined
    llvm.ctlz.i8 value false
# Wrap a collector so that it receives UTF8 enum values decoded from a stream
# of i8 bytes. When `coll` is none, the adapter `_decoder` itself is returned
# (so it can be applied to a collector later); otherwise the adapter is
# applied to `coll` right away.
# NOTE(review): no check for overlong encodings or surrogate codepoints is
# visible in this block - confirm whether callers rely on such validation.
inline decoder (coll)
    """"Decode a i8 character stream encoded as UTF-8 as UTF8 enum value
    inline _decoder (coll)
        # unpack the four callbacks of the wrapped collector
        let init full? done push = ((coll as Collector))
        Collector
            # init: prepend the decoder's own state to the wrapped state
            inline ()
                # which byte we expect and the codepoint we are building
                _ 0:i8 0:u32 (init)
            # full?: forwarded to the wrapped collector; b and cp are ignored
            inline (b cp state...)
                full? state...
            # done: forwarded to the wrapped collector; b and cp are dropped,
            # so a sequence still in progress at this point is not reported
            inline (b cp state...)
                done state...
            # push: consume one byte and return the updated (b cp state...)
            inline (src b cp state...)
                let c = (imply (src) i8)
                # (ctlz (~ c)) is the number of leading one bits of c:
                # 0 for a 1 byte header, 1 for a continuation byte,
                # 2/3/4 for a 2/3/4 byte header
                # full state: expected byte (bits 4-5) and leading bits (bits 0-3)
                let st = (b | (ctlz (~ c)))
                switch st
                # expecting new codepoint, 1 byte header
                case 0b000000:i8
                    # 7 bits, reset
                    # NOTE(review): c is an i8 here while UTF8.Codepoint is
                    # declared with a u32 payload - confirm the conversion
                    return 0b000000:i8 0:u32
                        push (inline () (UTF8.Codepoint c)) state...
                # expecting new codepoint, 2 byte header
                case 0b000010:i8
                    # 11 bits; start with bits 6-10, expect byte 1
                    return 0b010000:i8 ((c & 0b11111:i8) as u32) state...
                # expecting new codepoint, 3 byte header
                case 0b000011:i8
                    # 16 bits; start with bits 12-15, expect byte 2
                    return 0b100000:i8 ((c & 0b1111:i8) as u32) state...
                # expecting new codepoint, 4 byte header
                case 0b000100:i8
                    # 21 bits; start with bits 18-20, expect byte 3
                    return 0b110000:i8 ((c & 0b111:i8) as u32) state...
                # expecting byte 3, cont header (shares the body below)
                pass 0b110001:i8
                # expecting byte 2, cont header
                case 0b100001:i8
                    # read 6 bits, count down by 1
                    return (b - 0b10000:i8)
                        (cp << 6:u32) | ((c & 0b111111:i8) as u32)
                        state...
                # expecting byte 1, cont header
                case 0b010001:i8
                    # read 6 bits, complete & reset
                    let cp = ((cp << 6:u32) | ((c & 0b111111:i8) as u32))
                    return 0b000000:i8 0:u32
                        push (inline () (UTF8.Codepoint cp)) state...
                # illegal: any other combination of expected byte and leading
                # bits; the offending byte is consumed and reported as Invalid
                default
                    # reset
                    return 0b000000:i8 0:u32
                        push (inline () (UTF8.Invalid c)) state...
    # without a collector, hand back the adapter itself
    static-if (none? coll) _decoder
    else (_decoder coll)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment