-- $Id: utf8.lua 179 2009-04-03 18:10:03Z pasta $ | |
-- | |
-- Provides UTF-8 aware string functions implemented in pure lua: | |
-- * utf8len(s) | |
-- * utf8sub(s, i, j) | |
-- * utf8reverse(s) | |
-- * utf8char(unicode) | |
-- * utf8unicode(s, i, j) | |
-- * utf8gensub(s, sub_len) | |
-- * utf8find(str, regex, init, plain) | |
-- * utf8match(str, regex, init) | |
-- * utf8gmatch(str, regex, all) | |
-- * utf8gsub(str, regex, repl, limit) | |
-- | |
-- If utf8data.lua (containing the lower<->upper case mappings) is loaded, these | |
-- additional functions are available: | |
-- * utf8upper(s) | |
-- * utf8lower(s) | |
-- | |
-- All functions behave as their non UTF-8 aware counterparts with the exception | |
-- that UTF-8 characters are used instead of bytes for all units. | |
--[[ | |
Copyright (c) 2006-2007, Kyle Smith | |
All rights reserved. | |
Contributors: | |
Alimov Stepan | |
Redistribution and use in source and binary forms, with or without | |
modification, are permitted provided that the following conditions are met: | |
* Redistributions of source code must retain the above copyright notice, | |
this list of conditions and the following disclaimer. | |
* Redistributions in binary form must reproduce the above copyright | |
notice, this list of conditions and the following disclaimer in the | |
documentation and/or other materials provided with the distribution. | |
* Neither the name of the author nor the names of its contributors may be | |
used to endorse or promote products derived from this software without | |
specific prior written permission. | |
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
--]] | |
-- ABNF from RFC 3629 | |
-- | |
-- UTF8-octets = *( UTF8-char ) | |
-- UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 | |
-- UTF8-1 = %x00-7F | |
-- UTF8-2 = %xC2-DF UTF8-tail | |
-- UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / | |
-- %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) | |
-- UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / | |
-- %xF4 %x80-8F 2( UTF8-tail ) | |
-- UTF8-tail = %x80-BF | |
-- | |
local byte = string.byte
local char = string.char
local dump = string.dump
local find = string.find
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local len = string.len
local lower = string.lower
local match = string.match
local rep = string.rep
local reverse = string.reverse
local sub = string.sub
local upper = string.upper
-- Lua 5.1/LuaJIT expose unpack as a global, Lua 5.2+ as table.unpack
local unpack = table.unpack or unpack
-- Returns the number of bytes (1-4) used by the UTF-8 character that starts
-- at byte i of s; i defaults to 1. Also doubles as a UTF-8 character
-- validator, following the RFC 3629 ABNF.
--
-- Raises an error when:
--   * s is not a string or i is not a number,
--   * there is no byte at position i,
--   * the sequence is cut short by the end of the string,
--   * the lead byte or any continuation byte is outside its allowed range.
local function utf8charbytes (s, i)
  -- argument defaults
  i = i or 1

  -- argument checking
  if type(s) ~= "string" then
    error("bad argument #1 to 'utf8charbytes' (string expected, got ".. type(s).. ")")
  end
  if type(i) ~= "number" then
    error("bad argument #2 to 'utf8charbytes' (number expected, got ".. type(i).. ")")
  end

  local c = byte(s, i)
  if not c then
    -- string.byte returned nothing: i does not address a byte of s
    error("bad argument #2 to 'utf8charbytes' (position out of range)")
  end

  -- UTF8-1 = %x00-7F (NUL included)
  if c <= 127 then
    return 1
  end

  -- the lead byte determines the sequence length
  local size
  if c >= 194 and c <= 223 then
    size = 2
  elseif c >= 224 and c <= 239 then
    size = 3
  elseif c >= 240 and c <= 244 then
    size = 4
  else
    error("Invalid UTF-8 character")
  end

  local c2 = byte(s, i + 1)
  local c3 = size >= 3 and byte(s, i + 2) or nil
  local c4 = size == 4 and byte(s, i + 3) or nil
  if not c2 or (size >= 3 and not c3) or (size == 4 and not c4) then
    error("UTF-8 string terminated early")
  end

  -- The second byte is restricted further for a few lead bytes
  -- (E0, ED, F0 and F4 in the ABNF); otherwise it is a plain UTF8-tail.
  local lo, hi = 128, 191
  if c == 224 then
    lo = 160
  elseif c == 237 then
    hi = 159
  elseif c == 240 then
    lo = 144
  elseif c == 244 then
    hi = 143
  end
  if c2 < lo or c2 > hi then
    error("Invalid UTF-8 character")
  end

  -- remaining bytes are UTF8-tail = %x80-BF
  if size >= 3 and (c3 < 128 or c3 > 191) then
    error("Invalid UTF-8 character")
  end
  if size == 4 and (c4 < 128 or c4 > 191) then
    error("Invalid UTF-8 character")
  end

  return size
end
-- Returns the number of UTF-8 characters in s.
-- Raises an error if s is not a string; any validation error raised by
-- utf8charbytes for a malformed sequence propagates to the caller.
local function utf8len (s)
  -- argument checking
  if type(s) ~= "string" then
    error("bad argument #1 to 'utf8len' (string expected, got ".. type(s).. ")")
  end

  local total_bytes = len(s)
  local pos = 1
  local count = 0

  -- hop from lead byte to lead byte, counting one character per hop
  while pos <= total_bytes do
    count = count + 1
    pos = pos + utf8charbytes(s, pos)
  end

  return count
end
-- Works like string.sub, except that i and j count UTF-8 characters instead
-- of bytes. j defaults to -1 (the last character); negative indices count
-- back from the end of the string.
local function utf8sub (s, i, j)
  if not j then j = -1 end

  local total_bytes = len(s)

  -- the character count is only needed to resolve negative indices
  local first_char, last_char = i, j
  if not (i >= 0 and j >= 0) then
    local char_count = utf8len(s)
    if not (i >= 0) then first_char = char_count + i + 1 end
    if not (j >= 0) then last_char = char_count + j + 1 end
  end

  -- an inverted interval is empty
  if first_char > last_char then
    return ""
  end

  -- walk the string once, recording the byte offsets for string.sub
  local first_byte, last_byte = 1, total_bytes
  local cursor, seen = 1, 0
  while cursor <= total_bytes do
    seen = seen + 1
    if seen == first_char then
      first_byte = cursor
    end
    cursor = cursor + utf8charbytes(s, cursor)
    if seen == last_char then
      last_byte = cursor - 1
      break
    end
  end

  -- start lies beyond the last character / end lies before the first one
  if first_char > seen then first_byte = total_bytes + 1 end
  if last_char < 1 then last_byte = 0 end

  return sub(s, first_byte, last_byte)
end
-- Replaces UTF-8 characters based on a mapping table.
-- Each character of s is looked up in mapping; when the lookup yields nil or
-- false the character is kept unchanged. Returns the rebuilt string.
-- Raises an error if s is not a string or mapping is not a table.
local function utf8replace (s, mapping)
  -- argument checking
  if type(s) ~= "string" then
    error("bad argument #1 to 'utf8replace' (string expected, got ".. type(s).. ")")
  end
  if type(mapping) ~= "table" then
    error("bad argument #2 to 'utf8replace' (table expected, got ".. type(mapping).. ")")
  end

  local total_bytes = len(s)
  local pos = 1

  -- collect the pieces and join them once, instead of growing a string
  -- with `..` on every character
  local pieces, count = {}, 0
  while pos <= total_bytes do
    local size = utf8charbytes(s, pos)
    local ch = sub(s, pos, pos + size - 1)
    count = count + 1
    pieces[count] = mapping[ch] or ch
    pos = pos + size
  end

  return table.concat(pieces)
end
-- Like string.upper, but applies unicode simple case conversions.
-- The lower->upper mapping is read from the global utf8_lc_uc, which this
-- file does not define (the file header names utf8data.lua as its source).
local function utf8upper (s)
  local lowerToUpper = utf8_lc_uc
  return utf8replace(s, lowerToUpper)
end
-- Like string.lower, but applies unicode simple case conversions.
-- The upper->lower mapping is read from the global utf8_uc_lc, which this
-- file does not define (the file header names utf8data.lua as its source).
local function utf8lower (s)
  local upperToLower = utf8_uc_lc
  return utf8replace(s, upperToLower)
end
-- Like string.reverse, but reverses UTF-8 characters instead of bytes.
-- Raises an error if s is not a string or contains a malformed sequence.
local function utf8reverse (s)
  -- argument checking
  if type(s) ~= "string" then
    error("bad argument #1 to 'utf8reverse' (string expected, got ".. type(s).. ")")
  end

  local pieces, count = {}, 0
  local pos = len(s)

  while pos > 0 do
    local last = pos
    local c = byte(s, pos)

    -- step back over continuation bytes (%x80-BF) to reach the lead byte
    while c >= 128 and c <= 191 do
      pos = pos - 1
      if pos < 1 then
        -- ran off the front: the string starts with a continuation byte
        error("Invalid UTF-8 character")
      end
      c = byte(s, pos)
    end

    -- the lead byte must account for exactly the bytes stepped over,
    -- otherwise stray continuation bytes would be dropped silently
    local size = utf8charbytes(s, pos)
    if pos + size - 1 ~= last then
      error("Invalid UTF-8 character")
    end

    count = count + 1
    pieces[count] = sub(s, pos, last)
    pos = pos - 1
  end

  return table.concat(pieces)
end
-- http://en.wikipedia.org/wiki/Utf8
-- http://developer.coronalabs.com/code/utf-8-conversion-utility
--
-- Encodes a single code point as a UTF-8 string of 1 to 4 bytes.
-- Raises an error when unicode is greater than 0x10FFFF.
local function utf8char(unicode)
  -- 1 byte: the value is its own encoding
  if unicode <= 0x7F then
    return char(unicode)
  end

  -- 2 bytes: 0xC0 lead plus one 6-bit continuation
  if unicode <= 0x7FF then
    local lead = 0xC0 + math.floor(unicode / 0x40)
    local tail = 0x80 + (unicode % 0x40)
    return char(lead, tail)
  end

  -- 3 bytes: 0xE0 lead plus two 6-bit continuations
  if unicode <= 0xFFFF then
    local lead = 0xE0 + math.floor(unicode / 0x1000)
    local middle = 0x80 + (math.floor(unicode / 0x40) % 0x40)
    local tail = 0x80 + (unicode % 0x40)
    return char(lead, middle, tail)
  end

  -- 4 bytes: peel three 6-bit groups off the low end, the rest is the lead
  if unicode <= 0x10FFFF then
    local rest = unicode
    local tails = {}
    for slot = 3, 1, -1 do
      tails[slot] = 0x80 + (rest % 0x40)
      rest = math.floor(rest / 0x40)
    end
    return char(0xF0 + rest, tails[1], tails[2], tails[3])
  end

  error 'Unicode cannot be greater than U+10FFFF!'
end
-- multipliers for the 6-bit payload groups of a multi-byte sequence
local shift_6 = 2^6
local shift_12 = 2^12
local shift_18 = 2^18

-- UTF-8 aware counterpart of string.byte: returns the code points of
-- characters i through j of str.
--   i         first character index, default 1; a negative i counts from the
--             end of the string
--   j         last character index, default i
--   byte_pos  optional byte offset at which the character for index i starts;
--             when given, i and j only determine how many characters are read
-- Returns nothing when i > j. Returns a single nil when no character exists
-- at the starting position. Decoding stops quietly at the end of the string.
-- Malformed sequences raise the errors of utf8charbytes.
local utf8unicode
utf8unicode = function(str, i, j, byte_pos)
  i = i or 1
  j = j or i

  if i > j then return end

  local total_bytes = #str
  local pos = byte_pos

  if not pos then
    -- translate character index i into a byte offset
    local first = i
    if first < 0 then
      first = utf8len(str) + first + 1
    end
    if first < 1 then
      return nil
    end
    pos = 1
    for _ = 2, first do
      if pos > total_bytes then break end
      pos = pos + utf8charbytes(str, pos)
    end
  end

  -- decode consecutive characters, always continuing from the byte that
  -- follows the previous character
  local codes, count = {}, 0
  for _ = i, j do
    if pos > total_bytes then break end

    local size = utf8charbytes(str, pos)
    local b1, b2, b3, b4 = byte(str, pos, pos + size - 1)
    local code
    if size == 1 then
      code = b1
    elseif size == 2 then
      code = (b1 - 0xC0) * shift_6 + (b2 - 0x80)
    elseif size == 3 then
      code = (b1 - 0xE0) * shift_12 + (b2 - 0x80) * shift_6 + (b3 - 0x80)
    else
      code = (b1 - 0xF0) * shift_18 + (b2 - 0x80) * shift_12
        + (b3 - 0x80) * shift_6 + (b4 - 0x80)
    end

    count = count + 1
    codes[count] = code
    pos = pos + size
  end

  if count == 0 then
    return nil
  end
  return unpack(codes, 1, count)
end
-- Returns an iterator over consecutive substrings of str, each sub_len UTF-8
-- characters long (default 1). Every call yields the substring together with
-- the byte offsets of its first and last byte. An optional `skip` argument
-- moves the read position forward by that many bytes before reading.
-- The iterator returns nothing once the end of the string is reached before
-- a full substring has been collected.
local function utf8gensub(str, sub_len)
  if not sub_len then sub_len = 1 end

  local cursor = 1
  local total_bytes = #str

  return function(skip)
    if skip then
      cursor = cursor + skip
    end

    local first = cursor
    local taken = 0
    while true do
      if cursor > total_bytes then return end
      taken = taken + 1
      cursor = cursor + utf8charbytes(str, cursor)
      if taken == sub_len then break end
    end

    local last = cursor - 1
    return sub(str, first, last), first, last
  end
end
-- Binary search for item in the ascending array sortedTable.
-- Returns true and an index holding item when found, otherwise false.
-- NOTE: when comp is truthy the narrowing loop is skipped, so only the first
-- and the last element are compared against item.
local function binsearch(sortedTable, item, comp)
  local lo, hi = 1, #sortedTable

  if not comp then
    -- shrink [lo, hi] until the two bounds are adjacent (or equal)
    while hi - lo > 1 do
      local mid = math.floor((lo + hi) / 2)
      if sortedTable[mid] > item then
        hi = mid
      else
        lo = mid
      end
    end
  end

  -- the item, if present, now sits on one of the two bounds
  if sortedTable[lo] == item then
    return true, lo
  end
  if sortedTable[hi] == item then
    return true, hi
  end
  return false
end
-- Code points and inclusive code-point ranges contributed by each %-class
-- letter. The entries are shared between matchers and never modified.
local classSets = {
  -- %a: all letters (ONLY ASCII)
  a = { ranges = { {65, 90}, {97, 122} } },
  -- %c: control characters
  c = { ranges = { {0, 31} }, codes = { 127 } },
  -- %d: digits
  d = { ranges = { {48, 57} } },
  -- %g: printable characters except space
  g = { ranges = {
    {1, 8}, {14, 31}, {33, 132}, {134, 159}, {161, 5759}, {5761, 8191},
    {8203, 8231}, {8234, 8238}, {8240, 8286}, {8288, 12287},
  } },
  -- %l: lowercase letters (ONLY ASCII)
  l = { ranges = { {97, 122} } },
  -- %p: punctuation characters (ONLY ASCII)
  p = { ranges = { {33, 47}, {58, 64}, {91, 96}, {123, 126} } },
  -- %s: space characters
  s = {
    ranges = { {9, 13}, {8192, 8202} },
    codes = { 32, 133, 160, 5760, 8232, 8233, 8239, 8287, 12288 },
  },
  -- %u: uppercase letters (ONLY ASCII)
  u = { ranges = { {65, 90} } },
  -- %w: alphanumeric characters (ONLY ASCII)
  w = { ranges = { {48, 57}, {65, 90}, {97, 122} } },
  -- %x: hexadecimal digits
  x = { ranges = { {48, 57}, {65, 70}, {97, 102} } },
}

-- Builds a predicate for one character class.
--   class  the class text: a single character, a "%x" escape, or the body of
--          a [...] set starting right after the '['
--   plain  when truthy every character of class is taken literally
-- Returns two values: a function(charCode) that tells whether the code point
-- belongs to the class, and the byte offset (within class) of the last
-- character consumed, which is the closing ']' when one is present.
-- A '^' as the very first character negates the set; anywhere else it is a
-- literal. A '-' with no usable left operand is a literal as well.
local function classMatchGenerator(class, plain)
  local codes = {}
  local ranges = {}
  local ignore = false       -- previous character was the '%' escape
  local range = false        -- a '-' was seen and waits for its right operand
  local firstletter = true
  local unmatch = false      -- set is negated with a leading '^'

  -- Adds a literal character, or closes the pending "a-b" range with it.
  local function addLiteral(c)
    local code = utf8unicode(c)
    if not range then
      codes[#codes + 1] = code
      return
    end
    range = false
    table.remove(codes)               -- drop the '-' placeholder
    local first = table.remove(codes) -- left operand of the range, if any
    if first then
      ranges[#ranges + 1] = {first, code}
    else
      -- nothing to the left of '-' (e.g. "[-a]" or "[%a-z]"):
      -- keep both characters as literals instead of storing {nil, code}
      codes[#codes + 1] = 45
      codes[#codes + 1] = code
    end
  end

  local skip
  for c, _, be in utf8gensub(class) do
    skip = be
    if plain then
      addLiteral(c)
      ignore = false
    elseif ignore then
      -- character following '%': a class letter or an escaped literal
      local set = classSets[c]
      if set then
        for _, r in ipairs(set.ranges) do
          ranges[#ranges + 1] = r
        end
        if set.codes then
          for _, code in ipairs(set.codes) do
            codes[#codes + 1] = code
          end
        end
      else
        addLiteral(c)
      end
      ignore = false
    elseif c == "%" then
      ignore = true
    elseif c == "-" then
      -- stored as a literal for now; removed again if a range gets closed
      codes[#codes + 1] = 45
      range = true
    elseif c == "^" then
      if firstletter then
        unmatch = true
      else
        addLiteral(c)
      end
    elseif c == ']' then
      break
    else
      addLiteral(c)
    end
    firstletter = false
  end

  -- binsearch needs the literal codes in ascending order
  table.sort(codes)

  local function inRanges(charCode)
    for _, r in ipairs(ranges) do
      if r[1] <= charCode and charCode <= r[2] then
        return true
      end
    end
    return false
  end

  if not unmatch then
    return function(charCode)
      return binsearch(codes, charCode) or inRanges(charCode)
    end, skip
  else
    -- -1 is the end-of-string marker; a negated set must not accept it
    return function(charCode)
      return charCode ~= -1 and not (binsearch(codes, charCode) or inRanges(charCode))
    end, skip
  end
end
-- utf8sub with an extra argument and an extra result value.
--   sb  optional byte offset at which scanning starts (default 1); character
--       indices are counted from that offset
-- Returns the substring and the byte offset just past it. When the resolved
-- start index lies after the end index only "" is returned.
local function utf8subWithBytes (s, i, j, sb)
  if not j then j = -1 end

  local cursor = sb or 1
  local total_bytes = len(s)

  -- the character count is only needed to resolve negative indices
  local first_char, last_char = i, j
  if not (i >= 0 and j >= 0) then
    local char_count = utf8len(s)
    if not (i >= 0) then first_char = char_count + i + 1 end
    if not (j >= 0) then last_char = char_count + j + 1 end
  end

  -- an inverted interval is empty
  if first_char > last_char then
    return ""
  end

  -- walk forward, recording the byte offsets for string.sub
  local first_byte, last_byte = 1, total_bytes
  local seen = 0
  while cursor <= total_bytes do
    seen = seen + 1
    if seen == first_char then
      first_byte = cursor
    end
    cursor = cursor + utf8charbytes(s, cursor)
    if seen == last_char then
      last_byte = cursor - 1
      break
    end
  end

  if first_char > seen then first_byte = total_bytes + 1 end
  if last_char < 1 then last_byte = 0 end

  return sub(s, first_byte, last_byte), last_byte + 1
end
-- Compiled matchers keyed by pattern text: `cache` for regular patterns,
-- `cachePlain` for plain ones. Both tables are weak in keys and values
-- (__mode = 'kv').
local cache = setmetatable({}, { __mode = 'kv' })
local cachePlain = setmetatable({}, { __mode = 'kv' })
-- Compiles a pattern into a matcher object and stores it in `cache`
-- (or in `cachePlain` when plain is truthy).
--
-- The matcher holds a list of step functions (matcher.functions). While
-- matching, matcher.func is the index of the current step and matcher.str
-- the current character index in the subject. Each step receives the code
-- point at matcher.str (-1 once the end of the subject is reached) and then
-- advances the step, advances the string position, or calls matcher:reset()
-- to fall back to the most recently registered restart point.
--
-- Raises an error for malformed patterns (a quantifier with nothing to
-- repeat, misplaced '^' or '$', unbalanced parentheses). The matcher is
-- cached only after the whole pattern compiled, so a failed compilation
-- never leaves a half-built matcher behind.
--
-- Returns the matcher; call matcher:process(str, start) to run it.
local function matcherGenerator(regex, plain)
  local matcher = {
    functions = {},
    captures = {}
  }

  -- step: exactly one character accepted by func
  local function simple(func)
    return function(cC)
      if func(cC) then
        matcher:nextFunc()
        matcher:nextStr()
      else
        matcher:reset()
      end
    end
  end

  -- step for '*': while func accepts, consume the character and stay on this
  -- step, first registering a restart point at the following step
  local function star(func)
    return function(cC)
      if func(cC) then
        matcher:fullResetOnNextFunc()
        matcher:nextStr()
      else
        matcher:nextFunc()
      end
    end
  end

  -- step for '-': always move on to the next step; when func accepts,
  -- register a restart point at this step, one character further
  local function minus(func)
    return function(cC)
      if func(cC) then
        matcher:fullResetOnNextStr()
      end
      matcher:nextFunc()
    end
  end

  -- step for '?': consume the character when func accepts (registering a
  -- restart point at the following step), then move on either way
  local function question(func)
    return function(cC)
      if func(cC) then
        matcher:fullResetOnNextFunc()
        matcher:nextStr()
      end
      matcher:nextFunc()
    end
  end

  -- step for a back-reference %1..%9: the text at the current position must
  -- equal the text recorded for capture `id`
  local function capture(id)
    return function(cC)
      local l = matcher.captures[id][2] - matcher.captures[id][1]
      local captured = utf8sub(matcher.string, matcher.captures[id][1], matcher.captures[id][2])
      local check = utf8sub(matcher.string, matcher.str, matcher.str + l)
      if captured == check then
        for i = 0, l do
          matcher:nextStr()
        end
        matcher:nextFunc()
      else
        matcher:reset()
      end
    end
  end

  -- step for '(': remember where capture `id` starts
  local function captureStart(id)
    return function(cC)
      matcher.captures[id][1] = matcher.str
      matcher:nextFunc()
    end
  end

  -- step for ')': remember where capture `id` ends
  local function captureStop(id)
    return function(cC)
      matcher.captures[id][2] = matcher.str - 1
      matcher:nextFunc()
    end
  end

  -- step for %bxy: str starts with the opening and closing characters.
  -- Returns the step and the number of pattern bytes those two occupy.
  local function balancer(str)
    local sum = 0
    local bc, ec = utf8sub(str, 1, 1), utf8sub(str, 2, 2)
    local skip = len(bc) + len(ec)
    bc, ec = utf8unicode(bc), utf8unicode(ec)
    return function(cC)
      if cC == ec and sum > 0 then
        sum = sum - 1
        if sum == 0 then
          matcher:nextFunc()
        end
        matcher:nextStr()
      elseif cC == bc then
        sum = sum + 1
        matcher:nextStr()
      else
        if sum == 0 or cC == -1 then
          sum = 0
          matcher:reset()
        else
          matcher:nextStr()
        end
      end
    end, skip
  end

  -- Step 1 starts a match attempt at the current position. It registers a
  -- restart one character further, and gives up when an anchored pattern has
  -- moved past its start or the end of the subject has been reached.
  matcher.functions[1] = function(cC)
    matcher:fullResetOnNextStr()
    matcher.seqStart = matcher.str
    matcher:nextFunc()
    if (matcher.str > matcher.startStr and matcher.fromStart) or matcher.str >= matcher.stringLen then
      matcher.stop = true
      matcher.seqStart = nil
    end
  end

  local lastFunc       -- predicate still waiting for a possible quantifier
  local ignore = false -- previous pattern character was '%'
  local skip = nil     -- extra pattern bytes to skip before the next read
  local it = (function()
    local gen = utf8gensub(regex)
    return function()
      return gen(skip)
    end
  end)()
  local cs = {}        -- stack of currently open capture ids

  for c, bs, be in it do
    skip = nil
    if plain then
      table.insert(matcher.functions, simple(classMatchGenerator(c, plain)))
    else
      if ignore then
        if find('123456789', c, 1, true) then
          if lastFunc then
            table.insert(matcher.functions, simple(lastFunc))
            lastFunc = nil
          end
          table.insert(matcher.functions, capture(tonumber(c)))
        elseif c == 'b' then
          if lastFunc then
            table.insert(matcher.functions, simple(lastFunc))
            lastFunc = nil
          end
          local b
          b, skip = balancer(sub(regex, be + 1, be + 9))
          table.insert(matcher.functions, b)
        else
          lastFunc = classMatchGenerator('%' .. c)
        end
        ignore = false
      else
        if c == '*' then
          if lastFunc then
            table.insert(matcher.functions, star(lastFunc))
            lastFunc = nil
          else
            error('invalid regex after ' .. sub(regex, 1, bs))
          end
        elseif c == '+' then
          if lastFunc then
            -- one mandatory occurrence followed by '*'
            table.insert(matcher.functions, simple(lastFunc))
            table.insert(matcher.functions, star(lastFunc))
            lastFunc = nil
          else
            error('invalid regex after ' .. sub(regex, 1, bs))
          end
        elseif c == '-' then
          if lastFunc then
            table.insert(matcher.functions, minus(lastFunc))
            lastFunc = nil
          else
            error('invalid regex after ' .. sub(regex, 1, bs))
          end
        elseif c == '?' then
          if lastFunc then
            table.insert(matcher.functions, question(lastFunc))
            lastFunc = nil
          else
            error('invalid regex after ' .. sub(regex, 1, bs))
          end
        elseif c == '^' then
          if bs == 1 then
            matcher.fromStart = true
          else
            error('invalid regex after ' .. sub(regex, 1, bs))
          end
        elseif c == '$' then
          if be == len(regex) then
            matcher.toEnd = true
          else
            error('invalid regex after ' .. sub(regex, 1, bs))
          end
        elseif c == '[' then
          if lastFunc then
            table.insert(matcher.functions, simple(lastFunc))
          end
          -- the class generator reports how many pattern bytes it consumed
          lastFunc, skip = classMatchGenerator(sub(regex, be + 1))
        elseif c == '(' then
          if lastFunc then
            table.insert(matcher.functions, simple(lastFunc))
            lastFunc = nil
          end
          table.insert(matcher.captures, {})
          table.insert(cs, #matcher.captures)
          table.insert(matcher.functions, captureStart(cs[#cs]))
          -- "()" yields the start position instead of a substring
          if sub(regex, be + 1, be + 1) == ')' then matcher.captures[#matcher.captures].empty = true end
        elseif c == ')' then
          if lastFunc then
            table.insert(matcher.functions, simple(lastFunc))
            lastFunc = nil
          end
          local cap = table.remove(cs)
          if not cap then
            error('invalid capture: "(" missing')
          end
          table.insert(matcher.functions, captureStop(cap))
        elseif c == '.' then
          if lastFunc then
            table.insert(matcher.functions, simple(lastFunc))
          end
          -- any character, but not the end-of-subject marker
          lastFunc = function(cC) return cC ~= -1 end
        elseif c == '%' then
          ignore = true
        else
          if lastFunc then
            table.insert(matcher.functions, simple(lastFunc))
          end
          lastFunc = classMatchGenerator(c)
        end
      end
    end
  end

  if #cs > 0 then
    error('invalid capture: ")" missing')
  end
  if lastFunc then
    table.insert(matcher.functions, simple(lastFunc))
  end
  lastFunc = nil
  ignore = nil

  -- Final step: the pattern has matched; with a '$' anchor the match must
  -- also end exactly at the end of the subject.
  table.insert(matcher.functions, function()
    if matcher.toEnd and matcher.str ~= matcher.stringLen then
      matcher:reset()
    else
      matcher.stop = true
    end
  end)

  -- move to the next step
  matcher.nextFunc = function(self)
    self.func = self.func + 1
  end

  -- move to the next character of the subject
  matcher.nextStr = function(self)
    self.str = self.str + 1
  end

  -- Each function below pushes a one-shot restart point: the installed reset
  -- restores the saved state and re-installs the previous reset handler.

  -- restart point: same string position as now
  matcher.strReset = function(self)
    local oldReset = self.reset
    local str = self.str
    self.reset = function(s)
      s.str = str
      s.reset = oldReset
    end
  end

  -- restart point: following step, same string position as now
  matcher.fullResetOnNextFunc = function(self)
    local oldReset = self.reset
    local func = self.func +1
    local str = self.str
    self.reset = function(s)
      s.func = func
      s.str = str
      s.reset = oldReset
    end
  end

  -- restart point: current step, one character further
  matcher.fullResetOnNextStr = function(self)
    local oldReset = self.reset
    local str = self.str + 1
    local func = self.func
    self.reset = function(s)
      s.func = func
      s.str = str
      s.reset = oldReset
    end
  end

  -- Runs the matcher on str starting at character index start (default 1,
  -- a negative value counts from the end). On success returns the first and
  -- last character index of the match followed by the captures; returns
  -- nothing when there is no match.
  matcher.process = function(self, str, start)
    self.func = 1
    start = start or 1
    self.startStr = (start >= 0) and start or utf8len(str) + start + 1
    self.seqStart = self.startStr
    self.str = self.startStr
    self.stringLen = utf8len(str) + 1
    self.string = str
    self.stop = false

    -- base reset handler: go back to step 1
    self.reset = function(s)
      s.func = 1
    end

    while not self.stop do
      if self.str < self.stringLen then
        local ch = utf8sub(str, self.str, self.str)
        self.functions[self.func](utf8unicode(ch))
      else
        -- past the last character: feed the end-of-subject marker
        self.functions[self.func](-1)
      end
    end

    if self.seqStart then
      local captures = {}
      -- ipairs keeps the captures in pattern order
      for _, pair in ipairs(self.captures) do
        if pair.empty then
          table.insert(captures, pair[1])
        else
          table.insert(captures, utf8sub(str, pair[1], pair[2]))
        end
      end
      return self.seqStart, self.str - 1, unpack(captures)
    end
  end

  -- the pattern compiled without error: make the matcher reusable
  if not plain then
    cache[regex] = matcher
  else
    cachePlain[regex] = matcher
  end

  return matcher
end
-- string.find counterpart working on UTF-8 characters.
-- Returns the first and last character index of the match followed by any
-- captures, or nothing when the pattern does not match.
--   init   character index at which the search starts
--   plain  when truthy the pattern is matched literally
local function utf8find(str, regex, init, plain)
  -- plain and regular matchers for the same text differ, so each kind is
  -- looked up in its own cache
  local store = plain and cachePlain or cache
  local matcher = store[regex] or matcherGenerator(regex, plain)
  return matcher:process(str, init)
end
-- string.match counterpart working on UTF-8 characters.
-- Returns the captures when the first capture is truthy, otherwise the whole
-- match; returns nothing when the pattern does not match.
local function utf8match(str, regex, init)
  local found = { utf8find(str, regex, init or 1) }
  local first, last = found[1], found[2]

  if not first then
    return
  end
  if found[3] then
    -- everything after the two position values is a capture
    return unpack(found, 3)
  end
  return utf8sub(str, first, last)
end
-- string.gmatch counterpart working on UTF-8 characters.
-- Returns an iterator yielding, for each successive match, its captures or
-- the whole match when there are none. When `all` is truthy the first and
-- last character index of the match are returned in front of the captures.
-- A leading '^' in the pattern is escaped, i.e. matched literally.
local function utf8gmatch(str, regex, all)
  regex = (utf8sub(regex,1,1) ~= '^') and regex or '%' .. regex
  local lastChar = 1
  return function()
    local found = {utf8find(str, regex, lastChar)}
    if found[1] then
      if found[2] < found[1] then
        -- empty match: step past its position, otherwise the next search
        -- would find the same empty match again and never terminate
        lastChar = found[1] + 1
      else
        lastChar = found[2] + 1
      end
      if found[all and 1 or 3] then
        return unpack(found, all and 1 or 3)
      end
      return utf8sub(str, found[1], found[2])
    end
  end
end
-- Computes the replacement text for one match, following string.gsub.
--   repl  string (with %0..%9 references), table or function
--   args  args[0] is the whole match, args[1..n] are the captures
-- String: "%d" inserts capture d (%0 is the whole match, and %1 is the whole
--   match when the pattern has no captures); "%" followed by any other
--   character inserts that character.
-- Table / function: queried or called with the first capture (or the whole
--   match when there are no captures); a nil or false result keeps the
--   original match.
-- Any other repl type yields ''.
-- Raises an error for a "%d" that refers to a capture that does not exist.
local function replace(repl, args)
  local kind = type(repl)

  if kind == 'string' then
    local pieces = {}
    local ignore = false -- previous character was '%'
    for c in utf8gensub(repl) do
      if ignore then
        local num = tonumber(c)
        if num then
          local value = args[num]
          if value == nil and num == 1 then
            -- no captures in the pattern: %1 stands for the whole match
            value = args[0]
          end
          if value == nil then
            error("invalid capture index %" .. c .. " in replacement string")
          end
          pieces[#pieces + 1] = value
        else
          pieces[#pieces + 1] = c
        end
        ignore = false
      elseif c == '%' then
        ignore = true
      else
        pieces[#pieces + 1] = c
      end
    end
    return table.concat(pieces)
  end

  local value
  if kind == 'table' then
    value = repl[args[1] or args[0]]
  elseif kind == 'function' then
    if #args > 0 then
      value = repl(unpack(args, 1))
    else
      value = repl(args[0])
    end
  else
    return ''
  end

  -- nil/false means "no replacement": keep the matched text
  if not value then
    return args[0]
  end
  return value
end
-- string.gsub counterpart working on UTF-8 characters.
-- Replaces matches of regex in str using repl (see `replace`), at most
-- `limit` of them when limit is given. Returns the resulting string and the
-- number of replacements made.
local function utf8gsub(str, regex, repl, limit)
  if not limit then limit = -1 end

  local out = ''
  local tailStart = 1 -- first character not yet copied to the output
  local nextMatch = utf8gmatch(str, regex, true)
  local hit = { nextMatch() }
  local count = 0

  while #hit > 0 and limit ~= count do
    local first, last = hit[1], hit[2]
    -- args[0] is the whole match, args[1..] are the captures
    local args = { [0] = utf8sub(str, first, last), unpack(hit, 3) }
    -- copy the text between the previous match and this one, then the
    -- replacement
    local before = utf8sub(str, tailStart, first - 1)
    out = out .. before .. replace(repl, args)
    tailStart = last + 1
    count = count + 1
    hit = { nextMatch() }
  end

  return out .. utf8sub(str, tailStart), count
end
-- Public module table. `byte` is an alias of `unicode`; dump, format, lower,
-- upper and rep are the plain string library functions.
local utf8 = {
  len = utf8len,
  sub = utf8sub,
  reverse = utf8reverse,
  char = utf8char,
  unicode = utf8unicode,
  gensub = utf8gensub,
  byte = utf8unicode,
  find = utf8find,
  match = utf8match,
  gmatch = utf8gmatch,
  gsub = utf8gsub,
  dump = dump,
  format = format,
  lower = lower,
  upper = upper,
  rep = rep,
}

return utf8
-- (Web page footer residue, not part of the module: "Sign up for free to
-- join this conversation on GitHub. Already have an account? Sign in to
-- comment")