Skip to content

Instantly share code, notes, and snippets.

@daurnimator
Created June 20, 2012 08:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save daurnimator/2958879 to your computer and use it in GitHub Desktop.
Save daurnimator/2958879 to your computer and use it in GitHub Desktop.
Add a UTF-8 locale to LPeg using ICU (from the C side)
#include "lua.h"
#include "lauxlib.h"
#include "lpeg.h"
#include "unicode/utf.h"
#include "unicode/uchar.h"
/* Looks up the pattern constructor stored in the Lua registry under
 * KEYNEWPATT and returns it as a Newpf.
 * Raises the Lua error "newpf not found" when the registry slot does not hold
 * a C function. On success the value fetched from the registry is popped
 * again, so the stack is left as it was on entry. */
static Newpf get_l_newpf ( lua_State *L ) {
	Newpf ctor;
	lua_getfield ( L , LUA_REGISTRYINDEX , KEYNEWPATT );
	ctor = (Newpf)lua_tocfunction ( L , -1 );
	if ( ctor == NULL ) {
		lua_pushstring ( L , "newpf not found" );
		lua_error ( L ); /* longjmp: never returns */
	}
	lua_pop ( L , 1 );
	return ctor;
}
/* Match callback: consumes exactly N UTF-8 encoded code points.
 *   s  - current position in the subject
 *   e  - end of the subject
 *   o  - start of the subject
 *   ud - points at an int holding N
 * Returns the position just past the N-th code point, or NULL when the
 * subject ends early or an ill-formed sequence is met. */
static const char * match_utf8_n (const char *s, const char *e, const char *o, const void *ud) {
	int remaining = *(const int *)ud;
	int pos = s - o;
	UChar32 c;
	while ( remaining > 0 ) {
		/* U8_NEXT must not be invoked at the end of the buffer */
		if ( pos >= (e-o) ) return NULL;
		U8_NEXT ( o , pos , e-o , c );
		if ( c < 0 ) return NULL;
		remaining--;
	}
	return o + pos;
}
/* Lua: N(n) -> pattern
 * Builds a pattern that uses match_utf8_n with the integer given as
 * argument 1; the integer is handed to the pattern constructor as the
 * pattern's user data. Returns 1 value (the new pattern). */
static int nchars ( lua_State *L ) {
	int count = luaL_checkint ( L , 1 );
	Newpf newpattern = get_l_newpf ( L );
	newpattern ( L , match_utf8_n , &count , sizeof(count) );
	return 1;
}
/* Match callback: consumes one UTF-8 encoded code point that lies inside an
 * inclusive range.
 *   s  - current position in the subject
 *   e  - end of the subject
 *   o  - start of the subject
 *   ud - points at two UChar32 values: { lowest , highest }
 * Returns the position just past the code point, or NULL when the subject is
 * exhausted, the sequence is ill-formed, or the code point is out of range. */
static const char * match_utf8_range (const char *s, const char *e, const char *o, const void *ud) {
	const UChar32* codepoint_range = ud;
	int i = s-o;
	UChar32 codepoint;
	/* U8_NEXT requires i < length; without this guard a match attempted at
	 * the end of the subject would read past it and could return e+1. */
	if ( s >= e ) return NULL;
	U8_NEXT ( o , i , e-o , codepoint );
	if ( codepoint < 0 ) return NULL;
	if ( codepoint < codepoint_range[0] || codepoint_range[1] < codepoint ) return NULL;
	return o+i;
}
/* Lua: R(str) -> pattern
 * Decodes the first two UTF-8 code points of argument 1 and builds a pattern
 * (via match_utf8_range) matching any single code point between them,
 * inclusive. Raises an argument error when the string does not start with
 * two well-formed code points. Bytes after the second code point are ignored.
 * Returns 1 value (the new pattern). */
static int range ( lua_State *L ) {
	size_t l;
	const char *r = luaL_checklstring(L, 1, &l);
	/* Start out invalid so that a missing character fails the check below */
	UChar32 codepoints[2] = { -1 , -1 };
	/* Two code points occupy at most 8 bytes; clamping keeps the length
	 * within int range no matter how long the argument is. */
	int len = ( l > 8 ) ? 8 : (int)l;
	int i = 0;
	/* U8_NEXT requires i < length, so only decode while bytes remain;
	 * otherwise an empty or one-character string would be read past its end. */
	if ( i < len ) { U8_NEXT ( r , i , len , codepoints[0] ); }
	if ( i < len ) { U8_NEXT ( r , i , len , codepoints[1] ); }
	luaL_argcheck(L, codepoints[0] >= 0 && codepoints[1] >= 0 , 1, "range must have two valid characters");
	get_l_newpf ( L ) ( L , match_utf8_range , codepoints , sizeof(codepoints) );
	return 1;
}
/* Match callback: consumes one UTF-8 encoded code point accepted by a
 * character-class predicate.
 *   s  - current position in the subject
 *   e  - end of the subject
 *   o  - start of the subject
 *   ud - points at a function pointer of type UBool (*)(UChar32)
 * Returns the position just past the code point, or NULL when the subject is
 * exhausted, the sequence is ill-formed, or the predicate rejects it. */
static const char * match_utf8_class (const char *s, const char *e, const char *o, const void *ud) {
	UBool (*validator)(UChar32) = *(UBool (**)(UChar32))ud;
	int i = s-o;
	UChar32 codepoint;
	/* U8_NEXT requires i < length; without this guard a match attempted at
	 * the end of the subject would read past it and could return e+1. */
	if ( s >= e ) return NULL;
	U8_NEXT ( o , i , e-o , codepoint );
	if ( codepoint < 0 ) return NULL;
	if ( !validator(codepoint) ) return NULL;
	return o+i;
}
/* Creates a pattern from `validator` and saves it in the table on top of the
 * stack under the field `name`.
 * The function pointer itself is passed to the pattern constructor as user
 * data; match_utf8_class reads it back when matching. */
static void reg_class ( lua_State *L , const char * name , UBool (*validator)(UChar32) ) {
	/* The size must be that of the function pointer being passed, not of a
	 * pointer to it. */
	get_l_newpf ( L ) ( L , match_utf8_class , (void*)&validator , sizeof( validator ) );
	lua_setfield ( L , -2 , name );
}
/* Lua: locale() -> table
 * Returns a new table with one pattern per character class, each backed by
 * the ICU predicate of the same name (alnum, alpha, cntrl, digit, graph,
 * lower, print, punct, space, upper, xdigit). */
static int locale ( lua_State *L ) {
	static const struct {
		const char *name;
		UBool (*validator)(UChar32);
	} classes[] = {
		{ "alnum"  , &u_isalnum  },
		{ "alpha"  , &u_isalpha  },
		{ "cntrl"  , &u_iscntrl  },
		{ "digit"  , &u_isdigit  },
		{ "graph"  , &u_isgraph  },
		{ "lower"  , &u_islower  },
		{ "print"  , &u_isprint  },
		{ "punct"  , &u_ispunct  },
		{ "space"  , &u_isspace  },
		{ "upper"  , &u_isupper  },
		{ "xdigit" , &u_isxdigit }
	};
	size_t k;
	lua_newtable ( L );
	/* Register the classes in the order listed above */
	for ( k = 0 ; k < sizeof(classes) / sizeof(classes[0]) ; k++ ) {
		reg_class ( L , classes[k].name , classes[k].validator );
	}
	return 1;
}
/* Functions exported by the module table; the {NULL, NULL} entry terminates
 * the list. The table is never modified, hence const. */
static const struct luaL_Reg funcs[] = {
	{"N", nchars},
	{"R", range},
	{"locale", locale},
	{NULL, NULL}
};
/* Module entry point.
 * Calls require("lpeg") (discarding its result) before building the module
 * table, then registers the functions listed in `funcs` into a new table and
 * returns that table. */
int luaopen_lpeg_utf8 ( lua_State *L ) {
	/* require "lpeg" */
	lua_getglobal ( L , "require" );
	lua_pushliteral ( L , "lpeg" );
	lua_call ( L , 1 , 0 );

	/* a NULL library name makes luaL_register fill the table on the stack top */
	lua_newtable ( L );
	luaL_register ( L , NULL , funcs );
	return 1;
}
# Builds the lpeg_utf8 shared library against Lua 5.1, lpeg and ICU.
LIBNAME = lpeg_utf8
OUT = $(LIBNAME).so
LUADIR = /usr/include/lua5.1/
LPEGDIR = ../lpeg-0.10.2/
# ICU include flags belong to the compile step, not the link step.
CFLAGS = -O2 -fpic --pedantic -I$(LUADIR) -I$(LPEGDIR) `pkg-config --cflags icu-uc`
CC = gcc

.PHONY: test clean

# Libraries must follow the object file: linkers running with --as-needed
# drop libraries that appear before the objects that reference them.
$(OUT): lpeg_utf8.o
	$(CC) -O -shared -fpic -o $(OUT) $< `pkg-config --libs icu-uc`

# Compiled by make's built-in %.o: %.c rule; rebuilt when the Makefile changes.
lpeg_utf8.o: Makefile lpeg_utf8.c

test: test.lua $(OUT)
	lua test.lua

# -f keeps `make clean` from failing when there is nothing to remove.
clean:
	rm -f *.o *.so
-- Smoke tests for the lpeg_utf8 module. Expects lpeg_utf8.so in the current
-- directory and lpeg to be reachable through require.
local lib = assert ( package.loadlib ( "./lpeg_utf8.so" , "luaopen_lpeg_utf8" ) )
-- Kept local: a global named `utf8` would leak out of this script and would
-- overwrite the standard library table of that name on Lua 5.3+.
local utf8 = lib()
local locale = utf8.locale()
local lpeg = require "lpeg"
local lpeg_locale = lpeg.locale()

-- N(n) consumes n code points; match returns the byte position after them
assert(utf8.N(1):match("a") == 2)
assert(utf8.N(1):match("Δ") == 3)
assert(utf8.N(2):match("aΔ") == 4)

-- Character classes
assert(locale.lower:match("a"))
assert(not locale.upper:match("a"))

-- R on single-byte characters, including the neighbours just outside the range
local ascii_lower = utf8.R"az"
assert(not ascii_lower:match("`"))
assert(ascii_lower:match("a"))
assert(ascii_lower:match("m"))
assert(ascii_lower:match("z"))
assert(not ascii_lower:match("{"))

-- R on four-byte characters, including the neighbours just outside the range
local some_hieroglyphs = utf8.R"𓀁𓐭"
assert(not some_hieroglyphs:match("𓀀"))
assert(some_hieroglyphs:match("𓀁"))
assert(some_hieroglyphs:match("𓋷"))
assert(some_hieroglyphs:match("𓐭"))
assert(not some_hieroglyphs:match("𓐮"))

-- Patterns from this module and from lpeg.locale() combined with plain lpeg
-- patterns in an ordered choice
local pass = (lpeg.P('_') + locale.alnum) + lpeg.P('-')
assert(pass:match("foo_bar"))
local fail = (lpeg_locale.alnum + lpeg.P('_')) + lpeg.P('-')
assert(fail:match("foo_bar"))

-- Repetition and capture around N(1): the capture must equal the whole input
local trim_space = lpeg.P(' ')^0 * lpeg.C(utf8.N(1)^0)
local input = 'foobar'
local result = lpeg.match(trim_space, input)
print(input:byte(1,-1))
print(result:byte(1,-1))
assert(input == result)

print("TESTS ALL PASSED")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment