Skip to content

Instantly share code, notes, and snippets.

@antonijn
Last active March 22, 2020 23:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save antonijn/8400302 to your computer and use it in GitHub Desktop.
Save antonijn/8400302 to your computer and use it in GitHub Desktop.
UTF-8 decoder
/// <summary>
/// Decodes one UTF-8 encoded code point from <paramref name="buf"/>, starting at
/// <paramref name="idx"/>, and advances <paramref name="idx"/> past the bytes consumed.
/// </summary>
/// <param name="buf">Buffer holding UTF-8 encoded data.</param>
/// <param name="idx">
/// Index of the first byte of the sequence. On return it points at the first byte that
/// was not consumed. On a malformed sequence the offending byte is left unconsumed
/// (except for a bad lead byte, which is skipped) so the caller can resynchronise.
/// </param>
/// <returns>
/// The decoded code point, or U+FFFD (0xFFFD) when the lead byte is not a valid
/// 1-4 byte UTF-8 lead or a continuation byte is not of the form 10xxxxxx.
/// Overlong encodings and surrogate values are not rejected.
/// </returns>
/// <remarks>
/// Reading past the end of <paramref name="buf"/> still raises the array's normal
/// out-of-range exception, as in the previous implementation.
/// </remarks>
private static int getch(byte[] buf, ref int idx)
{
    const int Replacement = 0xFFFD;   // U+FFFD REPLACEMENT CHARACTER
    const int MaxContinuation = 3;    // UTF-8 sequences are at most 4 bytes long

    byte nxt = buf[idx++];
    if ((nxt & 0x80) == 0) {
        // Single-byte (ASCII) sequence: the byte is the code point.
        return nxt;
    }

    // Widen the prefix pattern (110, 1110, 11110) until it matches the lead byte.
    // `lead` is the expected prefix value, `mask` selects the prefix bits plus the
    // terminating zero bit.
    int lead = 0xC0;
    int mask = 0xE0;
    int remunits = 1;
    while ((nxt & mask) != lead) {
        if (remunits == MaxContinuation) {
            // Stray continuation byte (10xxxxxx) or 0xF8..0xFF: no valid prefix
            // matches. Bail out instead of searching forever.
            return Replacement;
        }
        lead = mask;
        mask = (mask >> 1) | 0x80;
        ++remunits;
    }

    // Strip exactly the prefix bits. XOR-ing with `mask` (one bit wider than the
    // prefix) would wrongly set the bit just below the prefix.
    int cp = nxt ^ lead;
    while (remunits-- > 0) {
        byte unit = buf[idx];
        if ((unit & 0xC0) != 0x80) {
            // Not a continuation byte: leave it for the next call.
            return Replacement;
        }
        ++idx;
        cp = (cp << 6) | (unit & 0x3F);
    }
    return cp;
}
# getch: reads one byte at buffer[index], post-increments the 32-bit index in
# memory, and, when the byte's high bit is set, folds the low 6 bits of the
# following bytes into the result.
# Register roles as used below:
#   rdi - base address of the byte buffer
#   rsi - address of the 32-bit index, updated in place
#   eax - result
#   r11d - number of following bytes still to be consumed
# NOTE(review): presumably System V AMD64 argument registers (rdi = buffer,
# rsi = index pointer) - confirm against the caller.
getch:
push rbp
mov rbp, rsp
# r9 = old index (sign-extended); r8d = old index + 1, written back to memory.
movsxd r9, dword ptr [rsi]
lea r8d, dword ptr [r9 + 1]
mov dword ptr [rsi], r8d
# eax = first byte, zero-extended.
movzx eax, byte ptr [rdi + r9]
# High bit clear: return the byte unchanged.
test al, al
jns .LBB0_7
# Check for the prefix 110xxxxx: (byte & 0xE0) == 0xC0.
mov cl, al
and cl, -32
cmp cl, -64
jne .LBB0_2
# Prefix matched on the first try: result = byte ^ 0xE0, one following byte.
# NOTE(review): XOR with 0xE0 clears bits 7-6 but sets bit 5 (which is 0 in a
# 110xxxxx byte), so the value carried forward is 0x20 | payload.
xor al, -32
movzx eax, al
mov r11d, 1
jmp .LBB0_5
.LBB0_2:
# Prefix search: cl holds the current mask (starting at 0xE0), r11d the count.
mov cl, -32
mov r11d, 1
.align 16, 0x90
.LBB0_3:
# r10b = previous mask (the prefix value to match); cl = mask widened by one
# more leading 1 bit; count incremented each round.
mov r10b, cl
shr cl
or cl, -128
inc r11d
# Loop until (byte & new mask) == previous mask.
# NOTE(review): there is no other exit. For a byte of the form 10xxxxxx the
# comparison never succeeds (cl saturates at 0xFF and r10b follows), so this
# loop does not terminate.
mov dl, cl
and dl, al
cmp dl, r10b
jne .LBB0_3
# result = byte ^ new mask. As above, the mask is one bit wider than the
# matched prefix, so the bit just below the prefix ends up set.
xor cl, al
movzx eax, cl
# Skip the accumulation loop when the count is not positive.
test r11d, r11d
jle .LBB0_7
.LBB0_5:
# rdi = address of the byte after the first one; edx = index value to store
# once that byte has been consumed (old index + 2).
movsxd rcx, r8d
add rdi, rcx
lea edx, dword ptr [r9 + 2]
.align 16, 0x90
.LBB0_6:
# Per following byte: result = (result << 6) | (byte & 0x3F); the updated
# index is stored to memory on every iteration. The byte's top two bits are
# not checked.
dec r11d
shl eax, 6
mov dword ptr [rsi], edx
movzx ecx, byte ptr [rdi]
and ecx, 63
or eax, ecx
inc rdi
inc edx
test r11d, r11d
jg .LBB0_6
.LBB0_7:
# Return with the result in eax.
pop rbp
ret
/**
 * Decodes one UTF-8 encoded code point from buf, starting at *idx, and
 * advances *idx past the bytes that were consumed.
 *
 * No buffer length is available, so the caller must guarantee that the byte
 * at *idx is readable and that the data is followed by at least one byte that
 * is not a continuation byte (for example a NUL terminator); decoding of a
 * truncated sequence stops at that byte.
 *
 * @param buf  Buffer holding UTF-8 encoded data.
 * @param idx  In/out index of the first byte of the sequence. On a malformed
 *             sequence the offending continuation byte is left unconsumed so
 *             the caller can resynchronise; a bad lead byte is skipped.
 * @return The decoded code point, or U+FFFD (0xFFFD) when the lead byte is
 *         not a valid 1-4 byte UTF-8 lead or a continuation byte is not of
 *         the form 10xxxxxx. Overlong encodings and surrogate values are not
 *         rejected.
 */
uint32_t getch(uint8_t buf[], int *idx)
{
    /* U+FFFD REPLACEMENT CHARACTER, returned for malformed input. */
    const uint32_t replacement = 0xFFFD;
    /* UTF-8 sequences are at most 4 bytes long: 1 lead + 3 continuation. */
    const int max_continuation = 3;

    uint32_t cp;
    uint8_t nxt = buf[(*idx)++];
    uint8_t lead = 0xC0;
    uint8_t mask = 0xE0;
    int remunits = 1;

    if ((nxt & 0x80) == 0)
    {
        /* Single-byte (ASCII) sequence: the byte is the code point. */
        return nxt;
    }

    /*
     * Widen the prefix pattern (110, 1110, 11110) until it matches the lead
     * byte. `lead` is the expected prefix value, `mask` selects the prefix
     * bits plus the terminating zero bit.
     */
    while ((nxt & mask) != lead)
    {
        if (remunits == max_continuation)
        {
            /*
             * Stray continuation byte (10xxxxxx) or 0xF8..0xFF: no valid
             * prefix matches. Bail out instead of searching forever.
             */
            return replacement;
        }
        lead = mask;
        mask = (uint8_t)((mask >> 1) | 0x80);
        ++remunits;
    }

    /*
     * Strip exactly the prefix bits. XOR-ing with `mask` (one bit wider than
     * the prefix) would wrongly set the bit just below the prefix.
     */
    cp = (uint32_t)(nxt ^ lead);
    while (remunits-- > 0)
    {
        uint8_t unit = buf[*idx];
        if ((unit & 0xC0) != 0x80)
        {
            /* Not a continuation byte: leave it for the next call. */
            return replacement;
        }
        ++*idx;
        cp = (cp << 6) | (uint32_t)(unit & 0x3F);
    }
    return cp;
}
/// @brief Decodes one UTF-8 encoded code point from `buf`, starting at `*idx`,
///        and advances `*idx` past the bytes that were consumed.
///
/// No buffer length is available, so the caller must guarantee that the byte
/// at `*idx` is readable and that the data is followed by at least one byte
/// that is not a continuation byte (for example a NUL terminator); decoding of
/// a truncated sequence stops at that byte.
///
/// @param buf  Buffer holding UTF-8 encoded data.
/// @param idx  In/out index of the first byte of the sequence. On a malformed
///             sequence the offending continuation byte is left unconsumed so
///             the caller can resynchronise; a bad lead byte is skipped.
/// @return The decoded code point, or U+FFFD when the lead byte is not a valid
///         1-4 byte UTF-8 lead or a continuation byte is not of the form
///         10xxxxxx. Overlong encodings and surrogate values are not rejected.
std::uint32_t getch(std::uint8_t buf[], int *idx)
{
    constexpr std::uint32_t kReplacement = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
    constexpr int kMaxContinuation = 3;            // UTF-8 sequences are at most 4 bytes

    const std::uint8_t nxt = buf[(*idx)++];
    if ((nxt & 0x80) == 0)
    {
        // Single-byte (ASCII) sequence: the byte is the code point.
        return nxt;
    }

    // Widen the prefix pattern (110, 1110, 11110) until it matches the lead
    // byte. `lead` is the expected prefix value, `mask` selects the prefix bits
    // plus the terminating zero bit.
    std::uint8_t lead = 0xC0;
    std::uint8_t mask = 0xE0;
    int remunits = 1;
    while ((nxt & mask) != lead)
    {
        if (remunits == kMaxContinuation)
        {
            // Stray continuation byte (10xxxxxx) or 0xF8..0xFF: no valid prefix
            // matches. Bail out instead of searching forever.
            return kReplacement;
        }
        lead = mask;
        mask = static_cast<std::uint8_t>((mask >> 1) | 0x80);
        ++remunits;
    }

    // Strip exactly the prefix bits. XOR-ing with `mask` (one bit wider than
    // the prefix) would wrongly set the bit just below the prefix.
    std::uint32_t cp = static_cast<std::uint32_t>(nxt ^ lead);
    for (; remunits > 0; --remunits)
    {
        const std::uint8_t unit = buf[*idx];
        if ((unit & 0xC0) != 0x80)
        {
            // Not a continuation byte: leave it for the next call.
            return kReplacement;
        }
        ++*idx;
        cp = (cp << 6) | static_cast<std::uint32_t>(unit & 0x3F);
    }
    return cp;
}
/// <summary>
/// Decodes one UTF-8 encoded code point from <paramref name="buf"/>, starting at
/// <paramref name="idx"/>, and advances <paramref name="idx"/> past the bytes consumed.
/// </summary>
/// <param name="buf">Buffer holding UTF-8 encoded data.</param>
/// <param name="idx">
/// Index of the first byte of the sequence. On return it points at the first byte that
/// was not consumed. On a malformed sequence the offending continuation byte is left
/// unconsumed so the caller can resynchronise; a bad lead byte is skipped.
/// </param>
/// <returns>
/// The decoded code point, or U+FFFD (0xFFFD) when the lead byte is not a valid
/// 1-4 byte UTF-8 lead or a continuation byte is not of the form 10xxxxxx.
/// Overlong encodings and surrogate values are not rejected.
/// </returns>
/// <exception cref="System.IndexOutOfRangeException">
/// The sequence runs past the end of <paramref name="buf"/> (unchanged from the
/// previous implementation).
/// </exception>
private static uint GetCh(byte[] buf, ref int idx)
{
    const uint Replacement = 0xFFFD;  // U+FFFD REPLACEMENT CHARACTER
    const int MaxContinuation = 3;    // UTF-8 sequences are at most 4 bytes long

    byte nxt = buf[idx++];
    if ((nxt & 0x80) == 0)
    {
        // Single-byte (ASCII) sequence: the byte is the code point.
        return nxt;
    }

    // Widen the prefix pattern (110, 1110, 11110) until it matches the lead byte.
    // `lead` is the expected prefix value, `mask` selects the prefix bits plus the
    // terminating zero bit.
    int lead = 0xC0;
    int mask = 0xE0;
    int remunits = 1;
    while ((nxt & mask) != lead)
    {
        if (remunits == MaxContinuation)
        {
            // Stray continuation byte (10xxxxxx) or 0xF8..0xFF: no valid prefix
            // matches. Bail out instead of searching forever.
            return Replacement;
        }
        lead = mask;
        mask = (mask >> 1) | 0x80;
        ++remunits;
    }

    // Strip exactly the prefix bits. XOR-ing with `mask` (one bit wider than the
    // prefix) would wrongly set the bit just below the prefix.
    uint cp = (uint)(nxt ^ lead);
    while (remunits-- > 0)
    {
        byte unit = buf[idx];
        if ((unit & 0xC0) != 0x80)
        {
            // Not a continuation byte: leave it for the next call.
            return Replacement;
        }
        ++idx;
        cp = (cp << 6) | (uint)(unit & 0x3F);
    }
    return cp;
}
; ModuleID = 'main.ll'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; @getch(%buf, %idx): loads the byte at %buf[*%idx], stores *%idx + 1 back, and,
; when the byte's high bit is set, folds the low 6 bits of the following bytes
; into the i32 result while advancing *%idx once per byte.
; i8 constants are printed as signed: -32 = 0xE0, -64 = 0xC0, -128 = 0x80.
; Function Attrs: nounwind uwtable
define i32 @getch(i8* nocapture readonly %buf, i32* nocapture %idx) #0 {
; Entry: %1 = old index, %2 = old index + 1 (stored back), %5 = first byte.
%1 = load i32* %idx, align 4
%2 = add i32 %1, 1
store i32 %2, i32* %idx, align 4
%3 = sext i32 %1 to i64
%4 = getelementptr inbounds i8* %buf, i64 %3
%5 = load i8* %4, align 1
; Signed compare > -1 on an i8 is true exactly when the high bit is clear.
%6 = icmp sgt i8 %5, -1
br i1 %6, label %.loopexit.thread11, label %.preheader
; High bit clear: the zero-extended byte is the result.
.loopexit.thread11: ; preds = %0
%7 = zext i8 %5 to i32
br label %._crit_edge
; First prefix test: (byte & 0xE0) == 0xC0.
.preheader: ; preds = %0
%8 = and i8 %5, -32
%9 = icmp eq i8 %8, -64
br i1 %9, label %.loopexit.thread, label %.lr.ph5
; Matched on the first try: value = byte ^ 0xE0, one following byte.
; NOTE(review): XOR with 0xE0 sets bit 5 (which is 0 in a 110xxxxx byte), so
; the value carried forward is 0x20 | payload.
.loopexit.thread: ; preds = %.preheader
%10 = xor i8 %5, -32
%11 = zext i8 %10 to i32
br label %.lr.ph.preheader
; Prefix search loop: %mask.03 is the previous mask, %13 the mask widened by one
; more leading 1 bit, %14 the incremented count. Exits only when
; (byte & %13) == %mask.03.
; NOTE(review): for a byte of the form 10xxxxxx that comparison never becomes
; true, so this loop has no exit for such input.
.lr.ph5: ; preds = %.preheader, %.lr.ph5
%remunits.04 = phi i32 [ %14, %.lr.ph5 ], [ 1, %.preheader ]
%mask.03 = phi i8 [ %13, %.lr.ph5 ], [ -32, %.preheader ]
%12 = lshr i8 %mask.03, 1
%13 = or i8 %12, -128
%14 = add nsw i32 %remunits.04, 1
%15 = and i8 %13, %5
%16 = icmp eq i8 %15, %mask.03
br i1 %16, label %.loopexit, label %.lr.ph5
; value = byte ^ widened mask (same extra-bit effect as noted above); the
; accumulation loop is skipped unless the pre-increment count is > -1.
.loopexit: ; preds = %.lr.ph5
%17 = xor i8 %13, %5
%18 = zext i8 %17 to i32
%19 = icmp sgt i32 %remunits.04, -1
br i1 %19, label %.lr.ph.preheader, label %._crit_edge
; Merge point for both prefix paths: initial count and value for the
; accumulation loop; %20 = index of the first following byte as i64.
.lr.ph.preheader: ; preds = %.loopexit.thread, %.loopexit
%remunits.22.ph = phi i32 [ %14, %.loopexit ], [ 1, %.loopexit.thread ]
%cp.01.ph = phi i32 [ %18, %.loopexit ], [ %11, %.loopexit.thread ]
%20 = sext i32 %2 to i64
br label %.lr.ph
; Accumulation loop: value = (value << 6) | (next byte & 63); the advanced index
; is stored to *%idx on every iteration; repeats while the decremented count
; is > 0. The top two bits of the loaded byte are not checked.
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
%indvars.iv = phi i64 [ %20, %.lr.ph.preheader ], [ %indvars.iv.next, %.lr.ph ]
%21 = phi i32 [ %2, %.lr.ph.preheader ], [ %24, %.lr.ph ]
%remunits.22 = phi i32 [ %remunits.22.ph, %.lr.ph.preheader ], [ %22, %.lr.ph ]
%cp.01 = phi i32 [ %cp.01.ph, %.lr.ph.preheader ], [ %29, %.lr.ph ]
%22 = add nsw i32 %remunits.22, -1
%23 = shl i32 %cp.01, 6
%24 = add nsw i32 %21, 1
store i32 %24, i32* %idx, align 4
%25 = getelementptr inbounds i8* %buf, i64 %indvars.iv
%26 = load i8* %25, align 1
%27 = zext i8 %26 to i32
%28 = and i32 %27, 63
%29 = or i32 %28, %23
%30 = icmp sgt i32 %22, 0
%indvars.iv.next = add nsw i64 %indvars.iv, 1
br i1 %30, label %.lr.ph, label %._crit_edge
; Common exit: select the result from whichever path was taken.
._crit_edge: ; preds = %.lr.ph, %.loopexit, %.loopexit.thread11
%cp.0.lcssa = phi i32 [ %18, %.loopexit ], [ %7, %.loopexit.thread11 ], [ %29, %.lr.ph ]
ret i32 %cp.0.lcssa
}
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = metadata !{metadata !"clang version 3.4 (tags/RELEASE_34/final)"}
@limpkin
Copy link

limpkin commented Jan 20, 2020

https://gist.github.com/antonijn/8400302#file-utf8-aqua-L18 shouldn't it be lead and not mask?
Imagine a 2-byte UTF-8 sequence, e.g. 0xC0 xx: it won't go through your for() loop, so mask will remain 0xE0, and the XOR result will be 0x2x, no?

@limpkin
Copy link

limpkin commented Jan 20, 2020

that code will also loop indefinitely for a first byte value of 0b10xxxxxx

@antonijn
Copy link
Author

that code will also loop indefinitely for a first byte value of 0b10xxxxxx

Thanks for the comments/corrections, but I wrote this code when I was 15. Correctness wasn't my top priority at the time apparently :P

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment