Skip to content

Instantly share code, notes, and snippets.

@mpg
Created March 10, 2009 21:05
Show Gist options
  • Save mpg/77135 to your computer and use it in GitHub Desktop.
Save mpg/77135 to your computer and use it in GitHub Desktop.
A simple UTF-8 validator in (tex)lua
#!/usr/bin/env texlua
-- A simple UTF-8 validator in Lua. (Tested only with texlua.)
-- Manuel Pégourié-Gonnard, 2009, WTFPL v2.
--- Check whether a string is well-formed UTF-8 according to RFC 3629.
-- Rejects bytes that can never start a sequence (>= 245), stray or missing
-- continuation bytes, overlong encodings, code points above U+10FFFF and
-- the UTF-16 surrogate range U+D800..U+DFFF.
-- @tparam string str the byte string to validate
-- @treturn boolean true if str is valid UTF-8, false otherwise
-- @treturn[opt] string on failure, a message describing the first problem
--   found and the byte index at which it was detected
function is_valid_utf8(str)
  local len = string.len(str)
  -- true if b is not of the form 10xxxxxx (b must be a number)
  local not_cont = function(b) return b < 128 or b >= 192 end
  local i = 0
  local next_byte = function()
    i = i + 1
    return string.byte(str, i) -- nil once past the end of str
  end
  -- thresholds[l] is the smallest first byte announcing a sequence of at
  -- least l bytes. Every key is explicit on purpose: mixing `[2] = 192`
  -- with positional entries lets the positional ones take indices 1 and 2
  -- and silently overwrite the keyed entry.
  local thresholds = {[2] = 192, [3] = 224, [4] = 240}
  while i < len do
    local seq = {}
    seq[1] = next_byte()
    if seq[1] >= 245 then
      return false, 'Illegal byte '..seq[1]..' at byte '..i
    end
    if seq[1] >= 128 then
      local offset -- non-coding bits of the 1st byte
      for l = 2, 4 do
        local threshold = thresholds[l]
        if seq[1] >= threshold then -- >= l byte sequence
          seq[l] = next_byte()
          if seq[l] == nil then
            -- input ended in the middle of a multi-byte sequence
            return false, 'Missing continuation byte at byte '..i
          end
          if not_cont(seq[l]) then
            return false, 'Illegal continuation byte '..
              seq[l]..' at byte '..i
          end
          offset = threshold
        end
      end
      if offset == nil then
        -- 128..191: a continuation byte where a first byte was expected
        return false, 'Illegal first byte '..seq[1]..' at byte '..i
      end
      -- compute the code point for some verifications
      local code_point = seq[1] - offset
      for j = 2, #seq do
        code_point = code_point * 64 + seq[j] - 128
      end
      local n -- nominal length of the bytes sequence
      if code_point <= 0x00007F then n = 1
      elseif code_point <= 0x0007FF then n = 2
      elseif code_point <= 0x00FFFF then n = 3
      elseif code_point <= 0x10FFFF then n = 4
      end
      if n == nil then
        return false,
          'Code point '..code_point..' too large at byte '..i
      end
      if n ~= #seq then
        -- more bytes were used than the code point requires
        return false, 'Overlong encoding at byte '..i
      end
      if code_point >= 0xD800 and code_point <= 0xDFFF then
        return false, 'Code point '..code_point..
          ' reserved for utf-16 surrogate pairs at byte '..i
      end
    end -- if seq[1] >= 128
  end
  return true
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment