Last active
March 22, 2020 23:04
-
-
Save antonijn/8400302 to your computer and use it in GitHub Desktop.
UTF-8 decoder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Decodes one UTF-8 encoded code point from <paramref name="buf"/>, starting at
/// <paramref name="idx"/>, and advances <paramref name="idx"/> past every byte read.
/// </summary>
/// <param name="buf">Buffer holding the UTF-8 bytes.</param>
/// <param name="idx">Index of the first byte to decode; on return, the index just after the last byte consumed.</param>
/// <returns>
/// The decoded code point, or U+FFFD when the first byte is not a valid lead byte
/// (a stray continuation byte 0x80-0xBF, or 0xF8-0xFF). In that case only that one byte is consumed.
/// </returns>
/// <remarks>
/// Trailing bytes are not checked for the 10xxxxxx form, and overlong encodings are not rejected.
/// </remarks>
private static int getch(byte[] buf, ref int idx)
{
    const int Replacement = 0xFFFD;
    // UTF-8 sequences are at most four bytes: one lead plus three trailing bytes.
    const int MaxTrailing = 3;

    byte nxt = buf[idx++];
    if ((nxt & 0x80) == 0) {
        // Single-byte (ASCII) sequence: the byte is the code point.
        return nxt;
    }

    // Widen the mask one bit at a time until the lead pattern (110, 1110, 11110) matches.
    byte lead = 0xC0;
    byte mask = 0xE0;
    int remunits = 1;
    while ((nxt & mask) != lead) {
        if (remunits == MaxTrailing) {
            // No valid lead pattern matched; without this bound the scan never ends
            // for continuation bytes.
            return Replacement;
        }
        lead = mask;
        mask = (byte)((mask >> 1) | 0x80);
        ++remunits;
    }

    // Keep only the payload bits. XOR with the mask would leave the top marker bit set.
    int cp = nxt & ~mask;
    while (remunits-- > 0) {
        cp <<= 6;
        cp |= buf[idx++] & 0x3F;
    }
    return cp;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# getch: compiled form of the UTF-8 decoder.
# NOTE(review): register roles are inferred from how they are used below
# (rdi = buf pointer, rsi = pointer to idx, eax = result) -- confirm against the ABI.
getch: | |
push rbp | |
mov rbp, rsp | |
# r9 = *idx (sign-extended); r8d = *idx + 1, which is written back to *idx.
movsxd r9, dword ptr [rsi] | |
lea r8d, dword ptr [r9 + 1] | |
mov dword ptr [rsi], r8d | |
# eax = first byte, read at rdi + r9, zero-extended.
movzx eax, byte ptr [rdi + r9] | |
# Sign bit clear (byte < 0x80): return the byte unchanged.
test al, al | |
jns .LBB0_7 | |
# First lead test, peeled out of the loop: (byte & 0xE0) == 0xC0
# (-32 and -64 are 0xE0 and 0xC0 as 8-bit values).
mov cl, al | |
and cl, -32 | |
cmp cl, -64 | |
jne .LBB0_2 | |
# Matched on the first test: eax = byte ^ 0xE0, r11d = 1 trailing byte to read.
xor al, -32 | |
movzx eax, al | |
mov r11d, 1 | |
jmp .LBB0_5 | |
.LBB0_2: | |
# Set up the scan loop: cl = mask (0xE0), r11d = trailing-byte counter (1).
mov cl, -32 | |
mov r11d, 1 | |
.align 16, 0x90 | |
.LBB0_3: | |
# Each pass: r10b = previous mask (the pattern to match), cl = (cl >> 1) | 0x80,
# counter += 1. The only way out of this loop is (byte & cl) == r10b.
mov r10b, cl | |
shr cl | |
or cl, -128 | |
inc r11d | |
mov dl, cl | |
and dl, al | |
cmp dl, r10b | |
jne .LBB0_3 | |
# eax = byte ^ final mask; skip the trailing-byte loop if the counter is <= 0.
xor cl, al | |
movzx eax, cl | |
test r11d, r11d | |
jle .LBB0_7 | |
.LBB0_5: | |
# rdi = address of the first trailing byte (buf + old idx + 1);
# edx = the index value to store on the first pass (old idx + 2).
movsxd rcx, r8d | |
add rdi, rcx | |
lea edx, dword ptr [r9 + 2] | |
.align 16, 0x90 | |
.LBB0_6: | |
# Per trailing byte: counter -= 1, eax = (eax << 6) | (byte & 63),
# and *idx is updated on every pass.
dec r11d | |
shl eax, 6 | |
mov dword ptr [rsi], edx | |
movzx ecx, byte ptr [rdi] | |
and ecx, 63 | |
or eax, ecx | |
inc rdi | |
inc edx | |
test r11d, r11d | |
jg .LBB0_6 | |
.LBB0_7: | |
# Common exit: the result is in eax.
pop rbp | |
ret |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Decodes one UTF-8 encoded code point from buf, starting at buf[*idx], and
 * advances *idx past every byte read.
 *
 * buf: UTF-8 bytes. The length is not known here, so no bounds checking is
 *      done; the caller must guarantee the whole sequence is readable.
 * idx: in/out index of the next byte to decode.
 *
 * Returns the decoded code point, or U+FFFD when the first byte is not a
 * valid lead byte (a stray continuation byte 0x80-0xBF, or 0xF8-0xFF); in
 * that case only that one byte is consumed.
 *
 * Trailing bytes are not checked for the 10xxxxxx form, and overlong
 * encodings are not rejected.
 */
uint32_t getch(uint8_t buf[], int *idx)
{
    const uint32_t replacement = 0xFFFD;
    /* UTF-8 sequences are at most four bytes: one lead plus three trailing. */
    const int max_trailing = 3;

    const uint8_t nxt = buf[(*idx)++];
    if ((nxt & 0b10000000) == 0)
    {
        /* Single-byte (ASCII) sequence: the byte is the code point. */
        return nxt;
    }

    /* Widen the mask one bit at a time until the lead pattern matches. */
    uint8_t lead = 0b11000000;
    uint8_t mask = 0b11100000;
    int remunits = 1;
    while ((nxt & mask) != lead)
    {
        if (remunits == max_trailing)
        {
            /* No valid lead pattern matched; without this bound the scan
             * never ends for continuation bytes. */
            return replacement;
        }
        lead = mask;
        mask = (uint8_t)((mask >> 1) | 0b10000000);
        ++remunits;
    }

    /* Keep only the payload bits. XOR with the mask would leave the top
     * marker bit set. */
    uint32_t cp = (uint32_t)(nxt & (uint8_t)~mask);
    while (remunits-- > 0)
    {
        cp <<= 6;
        cp |= (uint32_t)(buf[(*idx)++] & 0b00111111);
    }
    return cp;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// @brief Decodes one UTF-8 encoded code point from @p buf, starting at
///        buf[*idx], and advances *idx past every byte read.
///
/// @param buf UTF-8 bytes. The length is not known here, so no bounds checking
///            is done; the caller must guarantee the whole sequence is readable.
/// @param idx In/out index of the next byte to decode.
/// @return The decoded code point, or U+FFFD when the first byte is not a valid
///         lead byte (a stray continuation byte 0x80-0xBF, or 0xF8-0xFF); in
///         that case only that one byte is consumed.
///
/// Trailing bytes are not checked for the 10xxxxxx form, and overlong encodings
/// are not rejected.
std::uint32_t getch(std::uint8_t buf[], int *idx)
{
    constexpr std::uint32_t replacement = 0xFFFD;
    // UTF-8 sequences are at most four bytes: one lead plus three trailing bytes.
    constexpr int max_trailing = 3;

    const std::uint8_t nxt = buf[(*idx)++];
    if ((nxt & 0b10000000) == 0)
    {
        // Single-byte (ASCII) sequence: the byte is the code point.
        return nxt;
    }

    // Widen the mask one bit at a time until the lead pattern (110, 1110, 11110) matches.
    std::uint8_t lead = 0b11000000;
    std::uint8_t mask = 0b11100000;
    int remunits = 1;
    while ((nxt & mask) != lead)
    {
        if (remunits == max_trailing)
        {
            // No valid lead pattern matched; without this bound the scan never
            // ends for continuation bytes.
            return replacement;
        }
        lead = mask;
        mask = static_cast<std::uint8_t>((mask >> 1) | 0b10000000);
        ++remunits;
    }

    // Keep only the payload bits. XOR with the mask would leave the top marker bit set.
    std::uint32_t cp = static_cast<std::uint32_t>(nxt & static_cast<std::uint8_t>(~mask));
    while (remunits-- > 0)
    {
        cp <<= 6;
        cp |= static_cast<std::uint32_t>(buf[(*idx)++] & 0b00111111);
    }
    return cp;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Decodes one UTF-8 encoded code point from <paramref name="buf"/>, starting at
/// <paramref name="idx"/>, and advances <paramref name="idx"/> past every byte read.
/// </summary>
/// <param name="buf">Buffer holding the UTF-8 bytes.</param>
/// <param name="idx">Index of the first byte to decode; on return, the index just after the last byte consumed.</param>
/// <returns>
/// The decoded code point, or U+FFFD when the first byte is not a valid lead byte
/// (a stray continuation byte 0x80-0xBF, or 0xF8-0xFF). In that case only that one byte is consumed.
/// </returns>
/// <remarks>
/// Trailing bytes are not checked for the 10xxxxxx form, and overlong encodings are not rejected.
/// </remarks>
private static uint GetCh(byte[] buf, ref int idx)
{
    const uint Replacement = 0xFFFD;
    // UTF-8 sequences are at most four bytes: one lead plus three trailing bytes.
    const int MaxTrailing = 3;

    byte nxt = buf[idx++];
    if ((nxt & 0x80) == 0)
    {
        // Single-byte (ASCII) sequence: the byte is the code point.
        return nxt;
    }

    // Widen the mask one bit at a time until the lead pattern (110, 1110, 11110) matches.
    byte lead = 0xC0;
    byte mask = 0xE0;
    int remunits = 1;
    while ((nxt & mask) != lead)
    {
        if (remunits == MaxTrailing)
        {
            // No valid lead pattern matched; without this bound the scan never ends
            // for continuation bytes.
            return Replacement;
        }
        lead = mask;
        mask = (byte)((mask >> 1) | 0x80);
        ++remunits;
    }

    // Keep only the payload bits. XOR with the mask would leave the top marker bit set.
    int cp = nxt & ~mask;
    while (remunits-- > 0)
    {
        cp <<= 6;
        cp |= buf[idx++] & 0x3F;
    }
    return (uint)cp;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ModuleID = 'main.ll' | |
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" | |
target triple = "x86_64-unknown-linux-gnu" | |
; Function Attrs: nounwind uwtable | |
; getch: optimized IR for the UTF-8 decoder. Reads one byte at %buf[*%idx],
; works out how many further bytes to read from its high bits, folds 6 bits
; per further byte into the result, and keeps *%idx updated as bytes are read.
define i32 @getch(i8* nocapture readonly %buf, i32* nocapture %idx) #0 { | |
; Entry: %1 = *idx, store *idx = %1 + 1, %5 = buf[%1].
%1 = load i32* %idx, align 4 | |
%2 = add i32 %1, 1 | |
store i32 %2, i32* %idx, align 4 | |
%3 = sext i32 %1 to i64 | |
%4 = getelementptr inbounds i8* %buf, i64 %3 | |
%5 = load i8* %4, align 1 | |
; Signed compare against -1 tests that the top bit of the byte is clear.
%6 = icmp sgt i8 %5, -1 | |
br i1 %6, label %.loopexit.thread11, label %.preheader | |
; Top bit clear: the result is the byte itself, zero-extended.
.loopexit.thread11: ; preds = %0 | |
%7 = zext i8 %5 to i32 | |
br label %._crit_edge | |
; First lead test, peeled out of the scan loop: (byte & 0xE0) == 0xC0
; (-32 and -64 are 0xE0 and 0xC0 as i8).
.preheader: ; preds = %0 | |
%8 = and i8 %5, -32 | |
%9 = icmp eq i8 %8, -64 | |
br i1 %9, label %.loopexit.thread, label %.lr.ph5 | |
; Matched on the first test: initial value is byte ^ 0xE0, with 1 byte left to
; read (see the phi in .lr.ph.preheader).
.loopexit.thread: ; preds = %.preheader | |
%10 = xor i8 %5, -32 | |
%11 = zext i8 %10 to i32 | |
br label %.lr.ph.preheader | |
; Scan loop: %13 = (mask >> 1) | 0x80 is the widened mask, %14 the incremented
; counter. The only exit is (byte & %13) == previous mask.
.lr.ph5: ; preds = %.preheader, %.lr.ph5 | |
%remunits.04 = phi i32 [ %14, %.lr.ph5 ], [ 1, %.preheader ] | |
%mask.03 = phi i8 [ %13, %.lr.ph5 ], [ -32, %.preheader ] | |
%12 = lshr i8 %mask.03, 1 | |
%13 = or i8 %12, -128 | |
%14 = add nsw i32 %remunits.04, 1 | |
%15 = and i8 %13, %5 | |
%16 = icmp eq i8 %15, %mask.03 | |
br i1 %16, label %.loopexit, label %.lr.ph5 | |
; Scan finished: initial value is byte ^ final mask. The guard tests the
; pre-increment counter against -1 before entering the read loop.
.loopexit: ; preds = %.lr.ph5 | |
%17 = xor i8 %13, %5 | |
%18 = zext i8 %17 to i32 | |
%19 = icmp sgt i32 %remunits.04, -1 | |
br i1 %19, label %.lr.ph.preheader, label %._crit_edge | |
; Merge the two ways into the read loop: bytes left to read and the initial
; accumulator value. %20 is the index of the first byte to read, widened to i64.
.lr.ph.preheader: ; preds = %.loopexit.thread, %.loopexit | |
%remunits.22.ph = phi i32 [ %14, %.loopexit ], [ 1, %.loopexit.thread ] | |
%cp.01.ph = phi i32 [ %18, %.loopexit ], [ %11, %.loopexit.thread ] | |
%20 = sext i32 %2 to i64 | |
br label %.lr.ph | |
; Read loop, once per remaining byte: store the incremented index to *idx,
; load buf[%indvars.iv], and set cp = (cp << 6) | (byte & 63).
.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph | |
%indvars.iv = phi i64 [ %20, %.lr.ph.preheader ], [ %indvars.iv.next, %.lr.ph ] | |
%21 = phi i32 [ %2, %.lr.ph.preheader ], [ %24, %.lr.ph ] | |
%remunits.22 = phi i32 [ %remunits.22.ph, %.lr.ph.preheader ], [ %22, %.lr.ph ] | |
%cp.01 = phi i32 [ %cp.01.ph, %.lr.ph.preheader ], [ %29, %.lr.ph ] | |
%22 = add nsw i32 %remunits.22, -1 | |
%23 = shl i32 %cp.01, 6 | |
%24 = add nsw i32 %21, 1 | |
store i32 %24, i32* %idx, align 4 | |
%25 = getelementptr inbounds i8* %buf, i64 %indvars.iv | |
%26 = load i8* %25, align 1 | |
%27 = zext i8 %26 to i32 | |
%28 = and i32 %27, 63 | |
%29 = or i32 %28, %23 | |
%30 = icmp sgt i32 %22, 0 | |
%indvars.iv.next = add nsw i64 %indvars.iv, 1 | |
br i1 %30, label %.lr.ph, label %._crit_edge | |
; Common exit: return whichever value the taken path produced.
._crit_edge: ; preds = %.lr.ph, %.loopexit, %.loopexit.thread11 | |
%cp.0.lcssa = phi i32 [ %18, %.loopexit ], [ %7, %.loopexit.thread11 ], [ %29, %.lr.ph ] | |
ret i32 %cp.0.lcssa | |
} | |
attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } | |
!llvm.ident = !{!0} | |
!0 = metadata !{metadata !"clang version 3.4 (tags/RELEASE_34/final)"} |
That code will also loop indefinitely when the first byte is a continuation byte (0b10xxxxxx).
That code will also loop indefinitely when the first byte is a continuation byte (0b10xxxxxx).
Thanks for the comments/corrections, but I wrote this code when I was 15. Correctness wasn't my top priority at the time apparently :P
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://gist.github.com/antonijn/8400302#file-utf8-aqua-L18 shouldn't it be lead and not mask?
Imagine a 2-byte UTF-8 sequence, 0xC0 xx: it won't go through your for() loop, so mask will remain 0xE0, and the XOR result will be 0x2x, no?