Skip to content

Instantly share code, notes, and snippets.

@zeen
Created September 23, 2011 02:10
Show Gist options
  • Save zeen/1236597 to your computer and use it in GitHub Desktop.
Save zeen/1236597 to your computer and use it in GitHub Desktop.
function bin(x)
local s,m={}
for i=0,7 do
m=x%2;x=(x-m)/2;
s[8-i]=m;
end
return table.concat(s);
end
function hex(x) return ("0x%02X"):format(x); end
function shiftr(n, shift)
local pow = 2 ^ shift;
local mod = n % pow;
return (n - mod) / pow;
end
-- return math.floor(n / pow);
function getbits(n, shift, count)
n = shiftr(n, shift);
return n % (2 ^ count);
end
function utf8_char_raw(code)
local char = string.char;
if code >= 0 and code <= 0x7F then
return code;
elseif code >= 0x80 and code <= 0x7FF then
return 0x80 + 0x40 + getbits(code, 6, 5), 0x80 + getbits(code, 0, 6);
elseif code >= 0x800 and code <= 0xFFFF then
return 0x80 + 0x40 + 0x20 + getbits(code, 12, 4), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6);
elseif code >= 0x10000 and code <= 0x10FFFF then
return 0x80 + 0x40 + 0x20 + 0x10 + getbits(code, 18, 3), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6);
-- the below aren't defined by unicode
elseif code >= 0x10000 and code <= 0x1fffff then
return 0x80 + 0x40 + 0x20 + 0x10 + getbits(code, 18, 3), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6);
elseif code >= 0x200000 and code <= 0x3FFFFFF then
return 0x80 + 0x40 + 0x20 + 0x10 + 0x8 + getbits(code, 24, 2), 0x80 + getbits(code, 18, 6), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6);
elseif code >= 0x4000000 and code <= 0x7FFFFFFF then
return 0x80 + 0x40 + 0x20 + 0x10 + 0x8 + 0x4 + getbits(code, 30, 1), 0x80 + getbits(code, 24, 6), 0x80 + getbits(code, 18, 6), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6);
end
end
function utf8_char(code, ...)
local char = string.char;
if ... then
local r = {code, ...};
for i=1,#r do r[i] = utf8_char(r[i]); end
return table.concat(r);
end
if code >= 0 and code <= 0x7F then
return char(code);
elseif code >= 0x80 and code <= 0x7FF then
return char(0x80 + 0x40 + getbits(code, 6, 5), 0x80 + getbits(code, 0, 6));
elseif code >= 0x800 and code <= 0xFFFF then
return char(0x80 + 0x40 + 0x20 + getbits(code, 12, 4), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6));
elseif code >= 0x10000 and code <= 0x10FFFF then
return char(0x80 + 0x40 + 0x20 + 0x10 + getbits(code, 18, 3), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6));
-- the below aren't defined by unicode
elseif code >= 0x10000 and code <= 0x1fffff then
return char(0x80 + 0x40 + 0x20 + 0x10 + getbits(code, 18, 3), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6));
elseif code >= 0x200000 and code <= 0x3FFFFFF then
return char(0x80 + 0x40 + 0x20 + 0x10 + 0x8 + getbits(code, 24, 2), 0x80 + getbits(code, 18, 6), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6));
elseif code >= 0x4000000 and code <= 0x7FFFFFFF then
return char(0x80 + 0x40 + 0x20 + 0x10 + 0x8 + 0x4 + getbits(code, 30, 1), 0x80 + getbits(code, 24, 6), 0x80 + getbits(code, 18, 6), 0x80 + getbits(code, 12, 6), 0x80 + getbits(code, 6, 6), 0x80 + getbits(code, 0, 6));
end
end
function str2hex(s)
return s:gsub(".", function(x) return ("%02X"):format(x:byte()); end):gsub("..", " %1"):sub(2);
end
function str2bin(s)
return s:gsub(".", function(x) return bin(x:byte()); end):gsub("........", " %1"):sub(2);
end
local is_valid_utf8; do
local null = "\0";
local function fix(s) return s:gsub("x(%x%x)", function(x) return string.char(tonumber(x, 16)) end); end
-- Patterns based on http://www.w3.org/International/questions/qa-forms-utf-8.en.php
-- verified in the Unicode specification (see http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf Table 3-7)
local ASCII = fix("[%zx01-x7F]");
local non_overlong_2_byte = fix("[xC2-xDF][x80-xBF]");
local excluding_overlongs = fix("xE0[xA0-xBF][x80-xBF]");
local straight_3_byte = fix("[xE1-xECxEExEF][x80-xBF][x80-xBF]");
local excluding_surrogates = fix("xED[x80-x9F][x80-xBF]");
local planes_1_3 = fix("xF0[x90-xBF][x80-xBF][x80-xBF]");
local planes_4_15 = fix("[xF1-xF3][x80-xBF][x80-xBF][x80-xBF]");
local plane_16 = fix("xF4[x80-x8F][x80-xBF][x80-xBF]");
local non_ASCII = fix("[x80-xFF]");
local non_ASCII_run = fix("[x80-xFF]+");
function is_valid_utf8(s)
for non_ascii in s:gmatch(non_ASCII_run) do
if non_ascii
:gsub(non_overlong_2_byte, null)
:gsub(excluding_overlongs, null)
:gsub(straight_3_byte, null)
:gsub(excluding_surrogates, null)
:gsub(planes_1_3, null)
:gsub(planes_4_15, null)
:gsub(plane_16, null)
:find(non_ASCII)
then return false; end
end
return true;
end
local invalid_xml_single = fix("[^x09x0Ax0Dx20-xFF]");
local invalid_xml_double = fix("xFF[xFExFF]");
function is_valid_xml_char(s)
return not s:find(invalid_xml_single) and not s:find(invalid_xml_double) and is_valid_utf8(s);
end
--print(is_valid_utf8("hello world"));
--print(is_valid_utf8(fix"xFF"));
--print(is_valid_utf8(fix"xFE"));
end
do
collectgarbage() print(collectgarbage('count'));
local function range_equal(a, b)
return a[1] == b[1] and a[2] == b[2];
end
local function range_string(a) return "["..a[1].."-"..a[2].."]"; end
local function ranges_string(a)
local s = "";
for i=1,#a do s = s..range_string(a[i]); end
return s;
end
local function range_merge(a, b)
if a[1] > b[2] then a,b = b,a; end
if a[2]+1 >= b[1] then
return {a[1], b[2]};
end
end
local function merge(a, b)
if #a == #b then
--print("merge:", ranges_string(a), ranges_string(b));
local unequal;
for i=1,#a do
if not range_equal(a[i], b[i]) then
if unequal then return nil; end -- two unequal
unequal = i;
end
end
if not unequal then return a; end -- same
-- just one unequal, do merge
local _c = range_merge(a[unequal], b[unequal]);
if _c then
local c = {};
for i=1,#a do c[i] = a[i]; end
c[unequal] = _c;
--print("yes");
return c;
end
end
end
--print("merge test:", merge({{229,229},{157,157},{155,155}}, {{229,229},{157,157},{156,156}}));
local splits_at = {{9},{13},{32},{194,128},{224,160,128},{225,128,128},{237,128,128},{238,160,128},{239,128,128},{239,191,128},{240,144,128,128},{241,128,128,128},{244,128,128,128}};
local valid = {};
function range(from, to)
for i=from, to do
local length = #valid;
local t = {utf8_char_raw(i)}
for i=1,#t do t[i] = {t[i], t[i]}; end
valid[#valid+1] = t;
while #valid > 1 do
local a, b = valid[#valid-1], valid[#valid];
local c = merge(a, b);
if not c then break; end
valid[#valid-1] = c;
valid[#valid] = nil;
end
local _1,_2,_3,_4 = utf8_char_raw(i);
for j=1,#splits_at do
local a1,a2,a3,a4 = unpack(splits_at[j]);
if _4==a4 and _3==a3 and _2==a2 and _1==a1 then print("up: ", i, str2hex(utf8_char(i))) end
end
--if #valid > length then print("up: ", i, str2hex(utf8_char(i))) end
--if #valid < length then print("down: ", i, str2hex(utf8_char(i))) end
end
end
range(0x9, 0x9);
range(0xA, 0xA);
range(0xD, 0xD);
range(0x20, 0xD7FF);
range(0xE000, 0xFFFD);
range(0x10000, 0x10FFFF);
--range(0x0, 0x7F);
--range(0x80, 0x7FF);
--range(0x800, 0xFFFF);
--range(0x10000, 0x10FFFF);
local char = string.char;
local function range_string_hex(a) return "["..str2hex(char(a[1])).."-"..str2hex(char(a[2])).."]"; end
local function ranges_string_hex(a)
local s = "";
for i=1,#a do s = s..range_string_hex(a[i]); end
return s;
end
local count = 0;
for _,range in ipairs(valid) do
print(ranges_string(range), ranges_string_hex(range));
count = count + 1;
end
print(count); print(collectgarbage('count'));
end--]]
ranges = { automerge = true };
function ranges:add(a, b)
if not b then b = a; end
if a > b then a,b = b,a end
local pos;
for i=1,#self do
local r = self[i];
if a <= r[1] then
pos = i;
break;
end
end
if not pos then pos = #self+1; end
table.insert(self, pos, {a, b});
if self.automerge then
if self:merge(pos - 1) == 0 then self:merge(pos); end
end
return self;
end
function ranges:subtract(a, b)
if not b then b = a; end
if a > b then a,b = b,a end
for i=#self,1,-1 do
local r=self[i];
if r[2] < a then break; end -- deletion range too high
if r[1] <= b then -- some overlap with deletion range
if a > r[1] and b < r[2] then -- deletion makes a hole
table.insert(self, i+1, {b+1, r[2]});
r[2] = a-1;
elseif a <= r[1] and b >= r[2] then -- full overlap
table.remove(self, i);
elseif b < r[2] then -- lower part may get chopped off
r[1] = b+1;
else -- upper part may get chopped off
r[2] = a-1;
end
end
end
return self;
end
function ranges:sort()
table.sort(self, function(a, b) return a[1] < b[1] or (a[1] == b[1] and a[2] < b[2]); end);
return self;
end
function ranges:merge(index)
local count = 0;
if index then
local r1 = self[index];
if not r1 then return; end
local i = index + 1;
while true do
local r2 = self[i];
if not r2 or r2[1] < r1[1] then break; end -- end or unsorted
if not(r1[2]+1 >= r2[1]) then break; end -- merge not possible
r1[2] = math.max(r1[2], r2[2]);
table.remove(self, i);
count = count + 1;
end
else
local i = 1;
while i < #self do
count = count + self:merge(i);
i = i + 1;
end
end
return count;
end
function ranges:__tostring()
local s = {};
for i=1,#self do
local r = self[i];
if r[1] == r[2] then
s[i] = r[1];
else
s[i] = "["..r[1].."-"..r[2].."]";
end
end
return "{ "..table.concat(s, ",").." }";
end
function ranges:split(n)
for i=#self,1,-1 do
local r=self[i];
if r[1] < n and r[2] >= n then -- hit
table.insert(self, i+1, {n, r[2]});
r[2] = n-1;
elseif r[2] < n then
break; -- went too low
end
end
return self;
end
setmetatable(ranges, ranges);
ranges:add(0x9, 0x9);
ranges:add(0xA, 0xA);
ranges:add(0xD, 0xD);
ranges:add(0x20, 0xD7FF);
ranges:add(0xE000, 0xFFFD);
ranges:add(0x10000, 0x10FFFF);
print(ranges);
ranges:split(0x80);
ranges:split(0x800);
ranges:split(0x10000);
ranges:split(0x200000);
ranges:split(0x4000000);
print(ranges);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment