Last active
March 11, 2021 01:28
-
-
Save pygy/7154512 to your computer and use it in GitHub Desktop.
UTF-8 encoder/decoder in Lua
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Returns the low six payload bits of a UTF-8 continuation byte.
-- Continuation bytes must lie in 0x80..0xBF (128..191); anything else,
-- including a missing byte (nil) from a truncated stream, is an error.
local function utf8_continuation_bits(byte)
  if not byte or byte < 128 or byte > 191 then
    error("Invalid or missing UTF-8 continuation byte")
  end
  return byte - 128
end

--- Decode a single UTF-8 encoded character.
-- @tparam string stream  byte string containing UTF-8 data
-- @tparam[opt=1] number i  1-based index of the character's lead byte
-- @treturn number  the decoded Unicode code point
-- @treturn number  the index of the next character in the stream
-- Raises an error when `i` is outside the stream, when the sequence is
-- truncated or malformed, when it is an overlong encoding, when it encodes
-- a UTF-16 surrogate, or when the result exceeds U+10FFFF.
local function utf8_to_codepoint(stream, i)
  i = i or 1
  local msB = string.byte(stream, i)
  if not msB then
    error("No byte at index " .. tostring(i) .. " of the stream")
  end

  if msB < 128 then
    -- Single byte: plain ASCII.
    return msB, i + 1
  elseif msB < 192 then
    error("Byte values between 0x80 to 0xBF cannot start a multibyte sequence")
  elseif msB < 224 then
    -- Two bytes: 110xxxxx 10xxxxxx
    local b0 = string.byte(stream, i + 1)
    local res = (msB - 192) * 64 + utf8_continuation_bits(b0)
    if res < 128 then
      error("Overlong UTF-8 encoding")
    end
    return res, i + 2
  elseif msB < 240 then
    -- Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    local b1, b0 = string.byte(stream, i + 1, i + 2)
    local res = (msB - 224) * 4096
      + utf8_continuation_bits(b1) * 64
      + utf8_continuation_bits(b0)
    if res < 2048 then
      error("Overlong UTF-8 encoding")
    end
    if 55296 <= res and res <= 57343 then
      error("UTF-16 surrogate lead are not valid codepoints")
    end
    return res, i + 3
  elseif msB < 248 then
    -- Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    local b2, b1, b0 = string.byte(stream, i + 1, i + 3)
    local res = (msB - 240) * 262144
      + utf8_continuation_bits(b2) * 4096
      + utf8_continuation_bits(b1) * 64
      + utf8_continuation_bits(b0)
    if res < 65536 then
      error("Overlong UTF-8 encoding")
    end
    -- Code points above U+10FFFF fall through to the generic error below.
    if res < 1114112 then
      return res, i + 4
    end
  end
  error("Invalid UTF-8 character")
end
--- Encode a Unicode code point as a UTF-8 byte string.
-- @tparam number c  code point: an integer in 0..0x10FFFF, excluding the
--   UTF-16 surrogate range 0xD800..0xDFFF
-- @treturn string  the 1 to 4 byte UTF-8 encoding of `c`
-- Raises an error (via assert) when `c` is not a valid code point.
local function codepoint_to_utf8(c)
  assert(
    type(c) == "number"
      and c >= 0
      and c == math.floor(c)
      and (55296 > c or c > 57343)
      and c < 1114112,
    "Bad Unicode code point: " .. tostring(c) .. "."
  )
  -- `/` is float division on Lua 5.3+, and string.char rejects numbers with
  -- a fractional part, so every shifted value is floored explicitly.
  local floor = math.floor
  if c < 128 then
    return string.char(c)
  elseif c < 2048 then
    return string.char(192 + floor(c / 64), 128 + c % 64)
  elseif c < 65536 then
    -- Surrogates were already rejected by the assert above.
    return string.char(
      224 + floor(c / 4096),
      128 + floor(c / 64) % 64,
      128 + c % 64
    )
  else
    return string.char(
      240 + floor(c / 262144),
      128 + floor(c / 4096) % 64,
      128 + floor(c / 64) % 64,
      128 + c % 64
    )
  end
end
There are still some bugs here; I think https://github.com/Stepets/utf8.lua is a better choice!
For encoding and decoding, see https://github.com/luapower/utf8
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
anyway, after making these changes it works for me. nicer than my old way of doing it :)