Skip to content

Instantly share code, notes, and snippets.

@hishamhm
Last active August 29, 2015 13:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hishamhm/10814558 to your computer and use it in GitHub Desktop.
Save hishamhm/10814558 to your computer and use it in GitHub Desktop.
Proof-of-concept patch adding UTF-8-aware pattern matching to Lua 5.3.0 (work2)
Only in lua-5.3.0-work2-utf8patterns: lstrlib.o
diff -ur lua-5.3.0-work2/src/lstrlib.c lua-5.3.0-work2-utf8patterns/src/lstrlib.c
--- lua-5.3.0-work2/src/lstrlib.c 2014-03-21 11:26:44.000000000 -0300
+++ lua-5.3.0-work2-utf8patterns/src/lstrlib.c 2014-04-16 21:47:21.000000000 -0300
@@ -209,6 +209,7 @@
const char *p_end; /* end ('\0') of pattern */
lua_State *L;
int level; /* total number of captures (finished or unfinished) */
+ int isutf8; /* true if input and pattern are UTF-8 */
struct {
const char *init;
ptrdiff_t len;
@@ -229,6 +230,20 @@
#define L_ESC '%'
#define SPECIALS "^$*+?.([%-"
+#define iscont(p) ((*(p) & 0xC0) == 0x80) /* is *p a UTF-8 continuation byte (10xxxxxx)? */
+/* unrolled UTF-8 traversal; 'ut' nonzero selects UTF-8 mode, zero selects byte mode; 'p' may be evaluated several times */
+#define NEXT(ut,p) (!(ut) ? (p)+1 : ((!iscont((p)+1)) ? (p)+1 : \
+ ((!iscont((p)+2)) ? (p)+2 : \
+ ((!iscont((p)+3)) ? (p)+3 : (p)+4 ))))
+#define PREV(ut,p) (!(ut) ? (p)-1 : (iscont((p)-1) ? (iscont((p)-2) ? \
+ (iscont((p)-3) ? (p)-4 : (p)-3) : (p)-2) : (p)-1))
+#define CHAR_AT(ut,p) (!(ut) ? uchar(*(p)) : (int) /* packs the raw bytes into one int; not a decoded code point */ \
+ ( uchar(*(p)) | (iscont((p)+1) ? ((unsigned int)uchar(*((p)+1)) << 8 ) | \
+ (iscont((p)+2) ? ((unsigned int)uchar(*((p)+2)) << 16) | \
+ (iscont((p)+3) ? ((unsigned int)uchar(*((p)+3)) << 24) /* unsigned shift: no signed overflow */ \
+ : 0u) : 0u) : 0u) ))
+#define INC(ut,p) do{ (p) = NEXT(ut,p); }while(0)
+#define DEC(ut,p) do{ (p) = PREV(ut,p); }while(0)
static int check_capture (MatchState *ms, int l) {
l -= '1';
@@ -247,6 +262,7 @@
static const char *classend (MatchState *ms, const char *p) {
+ int ut = ms->isutf8;
switch (*p++) {
case L_ESC: {
if (p == ms->p_end)
@@ -258,8 +274,9 @@
do { /* look for a `]' */
if (p == ms->p_end)
luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
- if (*(p++) == L_ESC && p < ms->p_end)
- p++; /* skip escapes (e.g. `%]') */
+ if (*(NEXT(ut,p)) == L_ESC && p < ms->p_end)
+ INC(ms,p); /* skip escapes (e.g. `%]') */
+ INC(ms,p);
} while (*p != ']');
return p+1;
}
@@ -270,8 +287,10 @@
}
-static int match_class (int c, int cl) {
+static int match_class (int ut, int c, int cl) {
int res;
+ if (ut)
+ return (cl == c); /* "%" classes are ignored in UTF-8 mode */
switch (tolower(cl)) {
case 'a' : res = isalpha(c); break;
case 'c' : res = iscntrl(c); break;
@@ -290,7 +309,7 @@
}
-static int matchbracketclass (int c, const char *p, const char *ec) {
+static int matchbracketclass (int ut, int c, const char *p, const char *ec) {
int sig = 1;
if (*(p+1) == '^') {
sig = 0;
@@ -299,15 +318,17 @@
while (++p < ec) {
if (*p == L_ESC) {
p++;
- if (match_class(c, uchar(*p)))
+ if (match_class(ut, c, CHAR_AT(ut,p)))
return sig;
}
- else if ((*(p+1) == '-') && (p+2 < ec)) {
- p+=2;
- if (uchar(*(p-2)) <= c && c <= uchar(*p))
- return sig;
+ else {
+ const char* next = NEXT(ut,p);
+ if ((*next == '-') && (next+1 < ec)) {
+ if (CHAR_AT(ut,p) <= c && c <= CHAR_AT(ut,next+1))
+ return sig;
+ }
+ else if (CHAR_AT(ut,p) == c) return sig;
}
- else if (uchar(*p) == c) return sig;
}
return !sig;
}
@@ -318,17 +339,18 @@
if (s >= ms->src_end)
return 0;
else {
- int c = uchar(*s);
+ int c = CHAR_AT(ms,s);
+ int ut = ms->isutf8;
switch (*p) {
case '.': return 1; /* matches any char */
- case L_ESC: return match_class(c, uchar(*(p+1)));
- case '[': return matchbracketclass(c, p, ep-1);
- default: return (uchar(*p) == c);
+ case L_ESC: return match_class(ut, c, CHAR_AT(ms,p+1));
+ case '[': return matchbracketclass(ut, c, p, ep-1);
+ default: return (CHAR_AT(ms,p) == c);
}
}
}
-
+/* TODO Support UTF-8 */
static const char *matchbalance (MatchState *ms, const char *s,
const char *p) {
if (p >= ms->p_end - 1)
@@ -352,14 +374,15 @@
static const char *max_expand (MatchState *ms, const char *s,
const char *p, const char *ep) {
- ptrdiff_t i = 0; /* counts maximum expand for item */
- while (singlematch(ms, s + i, p, ep))
- i++;
+ const char* howfar = s; /* see how far we can go */
+ int ut = ms->isutf8;
+ while (singlematch(ms, howfar, p, ep))
+ INC(ms, howfar);
/* keeps trying to match with the maximum repetitions */
- while (i>=0) {
- const char *res = match(ms, (s+i), ep+1);
+ while (howfar > s) {
+ const char *res = match(ms, howfar, ep+1);
if (res) return res;
- i--; /* else didn't match; reduce 1 repetition to try again */
+ DEC(ut,howfar); /* else didn't match; reduce 1 repetition to try again */
}
return NULL;
}
@@ -372,7 +395,7 @@
if (res != NULL)
return res;
else if (singlematch(ms, s, p, ep))
- s++; /* try with one more repetition */
+ INC(ms, s); /* try with one more repetition */
else return NULL;
}
}
@@ -415,6 +438,7 @@
static const char *match (MatchState *ms, const char *s, const char *p) {
+ int ut = ms->isutf8;
if (ms->matchdepth-- == 0)
luaL_error(ms->L, "pattern too complex");
init: /* using goto's to optimize tail recursion */
@@ -442,20 +466,22 @@
case 'b': { /* balanced string? */
s = matchbalance(ms, s, p + 2);
if (s != NULL) {
- p += 4; goto init; /* return match(ms, s, p + 4); */
+ p += 2; INC(ut,p); INC(ut,p);
+ goto init; /* return match(ms, s, p + 4); */
} /* else fail (s == NULL) */
break;
}
case 'f': { /* frontier? */
- const char *ep; char previous;
+ /* TODO Untested with UTF-8 */
+ const char *ep; const char* previous;
p += 2;
if (*p != '[')
luaL_error(ms->L, "missing " LUA_QL("[") " after "
LUA_QL("%%f") " in pattern");
ep = classend(ms, p); /* points to what is next */
- previous = (s == ms->src_init) ? '\0' : *(s - 1);
- if (!matchbracketclass(uchar(previous), p, ep - 1) &&
- matchbracketclass(uchar(*s), p, ep - 1)) {
+ previous = (s == ms->src_init) ? "" : PREV(ut, s);
+ if (!matchbracketclass(ut, CHAR_AT(ms,previous), p, ep - 1) &&
+ matchbracketclass(ut, CHAR_AT(ms,s), p, ep - 1)) {
p = ep; goto init; /* return match(ms, s, ep); */
}
s = NULL; /* match failed */
@@ -488,7 +514,7 @@
switch (*ep) { /* handle optional suffix */
case '?': { /* optional */
const char *res;
- if ((res = match(ms, s + 1, ep + 1)) != NULL)
+ if ((res = match(ms, NEXT(ut, s), ep + 1)) != NULL)
s = res;
else {
p = ep + 1; goto init; /* else return match(ms, s, ep + 1); */
@@ -496,7 +522,7 @@
break;
}
case '+': /* 1 or more repetitions */
- s++; /* 1 match already done */
+ INC(ut,s); /* 1 match already done */
/* go through */
case '*': /* 0 or more repetitions */
s = max_expand(ms, s, p, ep);
@@ -505,7 +531,7 @@
s = min_expand(ms, s, p, ep);
break;
default: /* no suffix */
- s++; p = ep; goto init; /* return match(ms, s + 1, ep); */
+ INC(ut,s); p = ep; goto init; /* return match(ms, s + 1, ep); */
}
}
break;
@@ -605,9 +631,11 @@
MatchState ms;
const char *s1 = s + init - 1;
int anchor = (*p == '^');
+ int ut = 1; /* TODO: set it conditionally */
if (anchor) {
p++; lp--; /* skip anchor character */
}
+ ms.isutf8 = ut;
ms.L = L;
ms.matchdepth = MAXCCALLS;
ms.src_init = s;
@@ -626,7 +654,8 @@
else
return push_captures(&ms, s1, res);
}
- } while (s1++ < ms.src_end && !anchor);
+ INC(ut,s1);
+ } while (s1 <= ms.src_end && !anchor);
}
lua_pushnil(L); /* not found */
return 1;
@@ -649,14 +678,16 @@
const char *s = lua_tolstring(L, lua_upvalueindex(1), &ls);
const char *p = lua_tolstring(L, lua_upvalueindex(2), &lp);
const char *src;
+ int ut = 1; /* TODO: set it conditionally */
ms.L = L;
ms.matchdepth = MAXCCALLS;
ms.src_init = s;
ms.src_end = s+ls;
ms.p_end = p + lp;
+ ms.isutf8 = ut;
for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3));
src <= ms.src_end;
- src++) {
+ src = NEXT(ut, src)) {
const char *e;
ms.level = 0;
lua_assert(ms.matchdepth == MAXCCALLS);
@@ -748,6 +779,7 @@
int anchor = (*p == '^');
size_t n = 0;
MatchState ms;
+ ms.isutf8 = 1;
luaL_Buffer b;
luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment