Skip to content

Instantly share code, notes, and snippets.

@hinrik
Forked from daurnimator/lpeg_unicode.lua
Created June 18, 2012 18:18
Show Gist options
  • Save hinrik/2949801 to your computer and use it in GitHub Desktop.
Save hinrik/2949801 to your computer and use it in GitHub Desktop.
UTF-8 character classes for LPeg, using ICU4Lua
local lpeg = require 'lpeg'
local U = require 'icu.ustring'
local re = require 'icu.regex'
--- LPeg pattern that matches exactly one well-formed UTF-8 encoded character
-- and produces its code point (a number) as the single capture.
--
-- Only well-formed sequences (RFC 3629, section 4) are accepted: overlong
-- encodings, UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF do
-- not match, so no bogus code point is ever handed to the caller.
local utf8_codepoint
do
  local P, R = lpeg.P, lpeg.R

  -- Each decoder receives the matched byte sequence as a string. The constant
  -- subtracted at the end is the combined contribution of the marker bits of
  -- the lead byte and of every continuation byte (0x80).

  -- decode a two-byte UTF-8 sequence (12416 = 0xC0 * 64 + 0x80)
  local function f2 (s)
    local c1, c2 = string.byte(s, 1, 2)
    return c1 * 64 + c2 - 12416
  end

  -- decode a three-byte UTF-8 sequence
  -- (925824 = 0xE0 * 4096 + 0x80 * 64 + 0x80)
  local function f3 (s)
    local c1, c2, c3 = string.byte(s, 1, 3)
    return (c1 * 64 + c2) * 64 + c3 - 925824
  end

  -- decode a four-byte UTF-8 sequence
  -- (63447168 = 0xF0 * 262144 + 0x80 * 4096 + 0x80 * 64 + 0x80)
  local function f4 (s)
    local c1, c2, c3, c4 = string.byte(s, 1, 4)
    return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
  end

  local cont = R("\128\191") -- continuation byte (0x80..0xBF)

  -- U+0080..U+07FF; lead bytes 0xC0/0xC1 would be overlong and are excluded
  local seq2 = R("\194\223") * cont

  local seq3 = P("\224") * R("\160\191") * cont       -- E0: A0..BF, no overlongs
             + R("\225\236", "\238\239") * cont * cont -- E1..EC, EE..EF
             + P("\237") * R("\128\159") * cont       -- ED: 80..9F, no surrogates

  local seq4 = P("\240") * R("\144\191") * cont * cont -- F0: 90..BF, no overlongs
             + R("\241\243") * cont * cont * cont      -- F1..F3
             + P("\244") * R("\128\143") * cont * cont -- F4: 80..8F, <= U+10FFFF

  -- A function capture on a pattern without captures receives the whole
  -- matched substring, which is what string.byte and the decoders expect.
  utf8_codepoint = R("\0\127") / string.byte
                 + seq2 / f2
                 + seq3 / f3
                 + seq4 / f4
end
-- Names of the character classes exported by this module. Each name is used
-- both as the key in the returned table and as the Unicode property name in
-- the ICU regular expression \p{<name>}.
local class_names = {
  'alnum', 'alpha', 'cntrl', 'digit', 'graph', 'lower',
  'print', 'punct', 'space', 'upper', 'xdigit',
}

--- Build an LPeg pattern matching one UTF-8 character of the given class.
-- The character is decoded by `utf8_codepoint`, converted to an ICU string
-- with `U.char`, and tested against the anchored regex `^\p{<name>}$`.
-- @param name property name understood by ICU regex, e.g. 'alpha'
-- @return an LPeg match-time capture that consumes the character when it
--   belongs to the class, fails otherwise, and produces no capture values
local function make_class (name)
  local regex = re.compile('^\\p{' .. name .. '}$')
  return lpeg.Cmt(utf8_codepoint, function (_, _, codepoint)
    -- Coerce to a real boolean: `true` lets the match succeed at the position
    -- after the character, `false` makes the pattern fail.
    return not not re.match(regex, U.char(codepoint))
  end)
end

-- Generating every entry from its own name keeps the regex and the key in
-- sync (the hand-written table matched `xdigit` against the `digit` regex)
-- and avoids a module-level local called `print` shadowing the builtin.
local classes = {}
for _, name in ipairs(class_names) do
  classes[name] = make_class(name)
end

return classes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment