Created
March 10, 2009 21:05
-
-
Save mpg/77135 to your computer and use it in GitHub Desktop.
A simple UTF-8 validator in (tex)lua
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env texlua
-- A simple UTF-8 validator in Lua. (Tested only with texlua.)
-- Manuel Pégourié-Gonnard, 2009, WTFPL v2.
-- Checks whether str is a valid UTF-8 byte sequence according to RFC 3629.
--
-- str: the string to validate (its bytes are inspected one by one).
--
-- Returns true when every sequence in str is well formed. Otherwise returns
-- false plus a message describing the first problem found; the message ends
-- with the index of the last byte that was read when the problem was detected.
--
-- Rejected input: bytes >= 245, stray continuation bytes used as a first
-- byte, missing or malformed continuation bytes (including a sequence cut
-- short by the end of the string), overlong encodings, code points above
-- 0x10FFFF and the UTF-16 surrogate range 0xD800..0xDFFF.
function is_valid_utf8(str)
  local len = string.len(str)
  local i = 0 -- index of the last byte read from str

  -- Advances the cursor and returns the byte there (nil past the end).
  local function next_byte()
    i = i + 1
    return string.byte(str, i)
  end

  -- True when b is not a continuation byte (10xxxxxx); nil means "no byte".
  local function not_cont(b)
    return b == nil or b < 128 or b >= 192
  end

  -- lead_min[l] is the smallest first byte that announces a sequence of at
  -- least l bytes. It is also the value of the non-coding (marker) bits of
  -- the first byte of an l-byte sequence.
  local lead_min = { [2] = 192, [3] = 224, [4] = 240 }

  while i < len do
    local seq = {}
    seq[1] = next_byte()
    if seq[1] >= 245 then
      return false, 'Illegal byte '..seq[1]..' at byte '..i
    end
    if seq[1] >= 128 then
      if seq[1] < 192 then
        -- a continuation byte cannot start a sequence
        return false, 'Illegal first byte '..seq[1]..' at byte '..i
      end
      local offset -- non-coding bits of the 1st byte
      -- A numeric loop is used on purpose: lead_min has no entry at index 1,
      -- so ipairs would stop immediately.
      for l = 2, 4 do
        local threshold = lead_min[l]
        if seq[1] >= threshold then -- sequence of at least l bytes
          seq[l] = next_byte()
          if seq[l] == nil then
            -- the string ended in the middle of a multi-byte sequence
            return false, 'Unexpected end of string at byte '..i
          end
          if not_cont(seq[l]) then
            return false, 'Illegal continuation byte '..
              seq[l]..' at byte '..i
          end
          offset = threshold
        end
      end
      -- compute the code point for some verifications
      local code_point = seq[1] - offset
      for j = 2, #seq do
        code_point = code_point * 64 + seq[j] - 128
      end
      local n -- nominal (shortest possible) length of the byte sequence
      if code_point <= 0x00007F then n = 1
      elseif code_point <= 0x0007FF then n = 2
      elseif code_point <= 0x00FFFF then n = 3
      elseif code_point <= 0x10FFFF then n = 4
      end
      if n == nil then
        return false,
          'Code point '..code_point..' too large at byte '..i
      end
      if n ~= #seq then
        return false, 'Overlong encoding at byte '..i
      end
      if code_point >= 0xD800 and code_point <= 0xDFFF then
        return false, 'Code point '..code_point..
          ' reserved for utf-16 surrogate pairs at byte '..i
      end
    end -- if seq[1] >= 128
  end
  return true
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment