Skip to content

Instantly share code, notes, and snippets.

@daurnimator
Created June 17, 2012 04:28
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
  • Save daurnimator/2943418 to your computer and use it in GitHub Desktop.
Save daurnimator/2943418 to your computer and use it in GitHub Desktop.
Adding UTF8 support for lpeg
local lpeg = require "lpeg"
-- LPeg pattern that matches exactly one well-formed UTF-8 encoded character
-- and produces its code point (a number) as the capture value.
--
-- Only well-formed sequences are accepted: overlong encodings, encoded
-- UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF fail to match
-- instead of being decoded into bogus code points.
local utf8_codepoint
do
  local P, R = lpeg.P, lpeg.R

  -- Decode a two-byte UTF-8 sequence.
  -- 12416 = 0xC0 * 64 + 0x80: strips the lead-byte and continuation tags.
  local function decode2 (s)
    local c1, c2 = string.byte(s, 1, 2)
    return c1 * 64 + c2 - 12416
  end

  -- Decode a three-byte UTF-8 sequence.
  -- 925824 = 0xE0 * 4096 + 0x80 * 64 + 0x80.
  local function decode3 (s)
    local c1, c2, c3 = string.byte(s, 1, 3)
    return (c1 * 64 + c2) * 64 + c3 - 925824
  end

  -- Decode a four-byte UTF-8 sequence.
  -- 63447168 = 0xF0 * 262144 + 0x80 * 4096 + 0x80 * 64 + 0x80.
  local function decode4 (s)
    local c1, c2, c3, c4 = string.byte(s, 1, 4)
    return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
  end

  local cont = R("\128\191") -- continuation byte (0x80..0xBF)

  -- Three-byte forms. The second byte is restricted after 0xE0 (to exclude
  -- overlong encodings) and after 0xED (to exclude surrogates).
  local seq3 = P("\224") * R("\160\191") * cont
             + R("\225\236", "\238\239") * cont * cont
             + P("\237") * R("\128\159") * cont

  -- Four-byte forms. The second byte is restricted after 0xF0 (to exclude
  -- overlong encodings) and after 0xF4 (to stay at or below U+10FFFF).
  local seq4 = P("\240") * R("\144\191") * cont * cont
             + R("\241\243") * cont * cont * cont
             + P("\244") * R("\128\143") * cont * cont

  utf8_codepoint = R("\0\127") / string.byte
                 + R("\194\223") * cont / decode2
                 + seq3 / decode3
                 + seq4 / decode4
end
local ffi = require "ffi"
-- Declare the ICU character-classification functions to the LuaJIT FFI.
-- Every declared symbol carries the "_49" suffix, so these declarations only
-- resolve against a library that exports the functions under those names.
-- NOTE(review): UBool is declared as `bool` here, which lets the FFI return
-- Lua booleans from these calls; ICU's own headers presumably define UBool as
-- an 8-bit integer -- confirm the return-value ABI on the target platform.
ffi.cdef [[
typedef bool UBool;
typedef int32_t UChar32;
UBool u_islower_49(UChar32 c);
UBool u_isupper_49(UChar32 c);
UBool u_isdigit_49(UChar32 c);
UBool u_isalpha_49(UChar32 c);
UBool u_isalnum_49(UChar32 c);
UBool u_isxdigit_49(UChar32 c);
UBool u_ispunct_49(UChar32 c);
UBool u_isgraph_49(UChar32 c);
UBool u_isspace_49(UChar32 c);
UBool u_iscntrl_49(UChar32 c);
UBool u_isprint_49(UChar32 c);
]]
-- Load the "icuuc" shared library; the functions declared above are looked up
-- through this namespace object.
local ICU = ffi.load ( "icuuc" )
-- Table of LPeg patterns keyed by character-class name. Each pattern decodes
-- one UTF-8 character with utf8_codepoint and then asks the matching ICU
-- predicate (u_is<class>_49) whether the code point belongs to the class; the
-- predicate's result decides whether the match succeeds.
local utf8_locale = {}
do
  local class_names = {
    "alnum", "alpha", "cntrl", "digit", "graph", "lower",
    "print", "punct", "space", "upper", "xdigit",
  }
  for _, class in ipairs(class_names) do
    local symbol = "u_is" .. class .. "_49"
    -- The ICU symbol is looked up on every call, not cached at load time.
    utf8_locale[class] = lpeg.Cmt(utf8_codepoint, function (_, _, codepoint)
      return ICU[symbol](codepoint)
    end)
  end
end
-- Module exports: the code point decoding pattern and the table of
-- ICU-backed character-class patterns.
local exports = {
  utf8_codepoint = utf8_codepoint,
  utf8_locale = utf8_locale,
}
return exports
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment