Last active
August 29, 2015 13:59
-
-
Save hishamhm/10814558 to your computer and use it in GitHub Desktop.
Proof-of-concept patch for UTF-8 patterns in Lua 5.3.0 (work2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Only in lua-5.3.0-work2-utf8patterns: lstrlib.o | |
diff -ur lua-5.3.0-work2/src/lstrlib.c lua-5.3.0-work2-utf8patterns/src/lstrlib.c | |
--- lua-5.3.0-work2/src/lstrlib.c 2014-03-21 11:26:44.000000000 -0300 | |
+++ lua-5.3.0-work2-utf8patterns/src/lstrlib.c 2014-04-16 21:47:21.000000000 -0300 | |
@@ -209,6 +209,7 @@ | |
const char *p_end; /* end ('\0') of pattern */ | |
lua_State *L; | |
int level; /* total number of captures (finished or unfinished) */ | |
+ int isutf8; /* true if input and pattern are UTF-8 */ | |
struct { | |
const char *init; | |
ptrdiff_t len; | |
@@ -229,6 +230,20 @@ | |
#define L_ESC '%' | |
#define SPECIALS "^$*+?.([%-" | |
+#define iscont(p) ((*(p) & 0xC0) == 0x80) | |
+/* unrolled loops to traverse UTF-8 codepoints; arguments are fully parenthesized (p+1 is a common argument) and a NUL byte is never looked past */ | |
+#define NEXT(ut,p) ((!(ut) || *(p) == '\0' || !iscont((p)+1)) ? (p)+1 : \ | |
+ ((!iscont((p)+2)) ? (p)+2 : \ | |
+ ((!iscont((p)+3)) ? (p)+3 : (p)+4 ))) | |
+#define PREV(ut,p) ((!(ut) ? (p)-1 : (iscont((p)-1) ? (iscont((p)-2) ? \ | |
+ (iscont((p)-3) ? (p)-4 : (p)-3) : (p)-2) : (p)-1))) | |
+#define CHAR_AT(ut,p) ( (!(ut) || *(p) == '\0') ? uchar(*(p)) : \ | |
+ ( uchar(*(p)) | (iscont((p)+1) ? ((unsigned int)uchar(*((p)+1)) << 8 ) | \ | |
+ (iscont((p)+2) ? ((unsigned int)uchar(*((p)+2)) << 16) | \ | |
+ (iscont((p)+3) ? ((unsigned int)uchar(*((p)+3)) << 24) \ | |
+ : 0) : 0) : 0) )) | |
+#define INC(ut,p) do{ (p) = NEXT(ut,p); }while(0) | |
+#define DEC(ut,p) do{ (p) = PREV(ut,p); }while(0) | |
static int check_capture (MatchState *ms, int l) { | |
l -= '1'; | |
@@ -247,6 +262,7 @@ | |
static const char *classend (MatchState *ms, const char *p) { | |
+ int ut = ms->isutf8; | |
switch (*p++) { | |
case L_ESC: { | |
if (p == ms->p_end) | |
@@ -258,8 +274,9 @@ | |
do { /* look for a `]' */ | |
if (p == ms->p_end) | |
luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")"); | |
- if (*(p++) == L_ESC && p < ms->p_end) | |
- p++; /* skip escapes (e.g. `%]') */ | |
+ if (*p == L_ESC && NEXT(ut,p) < ms->p_end) | |
+ INC(ut,p); /* skip escapes (e.g. `%]'): step over the `%' so the escaped char is consumed below */ | |
+ INC(ut,p); | |
} while (*p != ']'); | |
return p+1; | |
} | |
@@ -270,8 +287,10 @@ | |
} | |
-static int match_class (int c, int cl) { | |
+static int match_class (int ut, int c, int cl) { | |
int res; | |
+ if (ut) | |
+ return (cl == c); /* "%" classes are ignored in UTF-8 mode */ | |
switch (tolower(cl)) { | |
case 'a' : res = isalpha(c); break; | |
case 'c' : res = iscntrl(c); break; | |
@@ -290,7 +309,7 @@ | |
} | |
-static int matchbracketclass (int c, const char *p, const char *ec) { | |
+static int matchbracketclass (int ut, int c, const char *p, const char *ec) { | |
int sig = 1; | |
if (*(p+1) == '^') { | |
sig = 0; | |
@@ -299,15 +318,21 @@ | |
while (++p < ec) { | |
if (*p == L_ESC) { | |
p++; | |
- if (match_class(c, uchar(*p))) | |
+ if (match_class(ut, c, CHAR_AT(ut,p))) | |
return sig; | |
+ p = NEXT(ut,p) - 1; /* park on the escaped char's last byte so ++p lands on the next item */ | |
} | |
- else if ((*(p+1) == '-') && (p+2 < ec)) { | |
- p+=2; | |
- if (uchar(*(p-2)) <= c && c <= uchar(*p)) | |
- return sig; | |
+ else { | |
+ const char* next = NEXT(ut,p); | |
+ if ((*next == '-') && (next+1 < ec)) { /* NOTE(review): bounds compare as CHAR_AT's packed bytes, not as code points */ | |
+ const char* hi = next+1; /* upper bound of the range */ | |
+ if (CHAR_AT(ut,p) <= c && c <= CHAR_AT(ut,hi)) | |
+ return sig; | |
+ p = NEXT(ut,hi) - 1; /* range did not match: skip `-' and the upper bound */ | |
+ } | |
+ else if (CHAR_AT(ut,p) == c) return sig; | |
+ else p = next - 1; /* step over every byte of this char */ | |
} | |
- else if (uchar(*p) == c) return sig; | |
} | |
return !sig; | |
} | |
@@ -318,17 +339,18 @@ | |
if (s >= ms->src_end) | |
return 0; | |
else { | |
- int c = uchar(*s); | |
+ int ut = ms->isutf8; /* pass the flag itself: the MatchState pointer is always non-null */ | |
+ int c = CHAR_AT(ut,s); | |
switch (*p) { | |
case '.': return 1; /* matches any char */ | |
- case L_ESC: return match_class(c, uchar(*(p+1))); | |
- case '[': return matchbracketclass(c, p, ep-1); | |
- default: return (uchar(*p) == c); | |
+ case L_ESC: return match_class(ut, c, CHAR_AT(ut,(p+1))); | |
+ case '[': return matchbracketclass(ut, c, p, ep-1); | |
+ default: return (CHAR_AT(ut,p) == c); | |
} | |
} | |
} | |
- | |
+/* TODO Support UTF-8 */ | |
static const char *matchbalance (MatchState *ms, const char *s, | |
const char *p) { | |
if (p >= ms->p_end - 1) | |
@@ -352,14 +374,16 @@ | |
static const char *max_expand (MatchState *ms, const char *s, | |
const char *p, const char *ep) { | |
- ptrdiff_t i = 0; /* counts maximum expand for item */ | |
- while (singlematch(ms, s + i, p, ep)) | |
- i++; | |
+ const char* howfar = s; /* see how far we can go */ | |
+ int ut = ms->isutf8; | |
+ while (singlematch(ms, howfar, p, ep)) | |
+ INC(ut, howfar); | |
/* keeps trying to match with the maximum repetitions */ | |
- while (i>=0) { | |
- const char *res = match(ms, (s+i), ep+1); | |
+ for (;;) { | |
+ const char *res = match(ms, howfar, ep+1); | |
if (res) return res; | |
- i--; /* else didn't match; reduce 1 repetition to try again */ | |
+ if (howfar == s) break; /* zero repetitions was the last candidate; never step before s */ | |
+ DEC(ut,howfar); /* else didn't match; reduce 1 repetition to try again */ | |
} | |
return NULL; | |
} | |
@@ -372,7 +395,7 @@ | |
if (res != NULL) | |
return res; | |
else if (singlematch(ms, s, p, ep)) | |
- s++; /* try with one more repetition */ | |
+ INC(ms->isutf8, s); /* try with one more repetition (flag, not the MatchState pointer) */ | |
else return NULL; | |
} | |
} | |
@@ -415,6 +438,7 @@ | |
static const char *match (MatchState *ms, const char *s, const char *p) { | |
+ int ut = ms->isutf8; | |
if (ms->matchdepth-- == 0) | |
luaL_error(ms->L, "pattern too complex"); | |
init: /* using goto's to optimize tail recursion */ | |
@@ -442,20 +466,22 @@ | |
case 'b': { /* balanced string? */ | |
s = matchbalance(ms, s, p + 2); | |
if (s != NULL) { | |
- p += 4; goto init; /* return match(ms, s, p + 4); */ | |
+ p += 2; INC(ut,p); INC(ut,p); | |
+ goto init; /* return match(ms, s, p + 4); */ | |
} /* else fail (s == NULL) */ | |
break; | |
} | |
case 'f': { /* frontier? */ | |
- const char *ep; char previous; | |
+ /* TODO Untested with UTF-8 */ | |
+ const char *ep; const char* previous; | |
p += 2; | |
if (*p != '[') | |
luaL_error(ms->L, "missing " LUA_QL("[") " after " | |
LUA_QL("%%f") " in pattern"); | |
ep = classend(ms, p); /* points to what is next */ | |
- previous = (s == ms->src_init) ? '\0' : *(s - 1); | |
- if (!matchbracketclass(uchar(previous), p, ep - 1) && | |
- matchbracketclass(uchar(*s), p, ep - 1)) { | |
+ previous = (s == ms->src_init) ? NULL : PREV(ut, s); /* NULL: no previous char, treated as '\0' */ | |
+ if (!matchbracketclass(ut, previous ? CHAR_AT(ut,previous) : 0, p, ep - 1) && | |
+ matchbracketclass(ut, (s < ms->src_end) ? CHAR_AT(ut,s) : 0, p, ep - 1)) { | |
p = ep; goto init; /* return match(ms, s, ep); */ | |
} | |
s = NULL; /* match failed */ | |
@@ -488,7 +514,7 @@ | |
switch (*ep) { /* handle optional suffix */ | |
case '?': { /* optional */ | |
const char *res; | |
- if ((res = match(ms, s + 1, ep + 1)) != NULL) | |
+ if ((res = match(ms, NEXT(ut, s), ep + 1)) != NULL) | |
s = res; | |
else { | |
p = ep + 1; goto init; /* else return match(ms, s, ep + 1); */ | |
@@ -496,7 +522,7 @@ | |
break; | |
} | |
case '+': /* 1 or more repetitions */ | |
- s++; /* 1 match already done */ | |
+ INC(ut,s); /* 1 match already done */ | |
/* go through */ | |
case '*': /* 0 or more repetitions */ | |
s = max_expand(ms, s, p, ep); | |
@@ -505,7 +531,7 @@ | |
s = min_expand(ms, s, p, ep); | |
break; | |
default: /* no suffix */ | |
- s++; p = ep; goto init; /* return match(ms, s + 1, ep); */ | |
+ INC(ut,s); p = ep; goto init; /* return match(ms, s + 1, ep); */ | |
} | |
} | |
break; | |
@@ -605,9 +631,11 @@ | |
MatchState ms; | |
const char *s1 = s + init - 1; | |
int anchor = (*p == '^'); | |
+ int ut = 1; /* TODO: set it conditionally */ | |
if (anchor) { | |
p++; lp--; /* skip anchor character */ | |
} | |
+ ms.isutf8 = ut; | |
ms.L = L; | |
ms.matchdepth = MAXCCALLS; | |
ms.src_init = s; | |
@@ -626,7 +654,8 @@ | |
else | |
return push_captures(&ms, s1, res); | |
} | |
- } while (s1++ < ms.src_end && !anchor); | |
+ INC(ut,s1); | |
+ } while (s1 <= ms.src_end && !anchor); | |
} | |
lua_pushnil(L); /* not found */ | |
return 1; | |
@@ -649,14 +678,16 @@ | |
const char *s = lua_tolstring(L, lua_upvalueindex(1), &ls); | |
const char *p = lua_tolstring(L, lua_upvalueindex(2), &lp); | |
const char *src; | |
+ int ut = 1; /* TODO: set it conditionally */ | |
ms.L = L; | |
ms.matchdepth = MAXCCALLS; | |
ms.src_init = s; | |
ms.src_end = s+ls; | |
ms.p_end = p + lp; | |
+ ms.isutf8 = ut; | |
for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3)); | |
src <= ms.src_end; | |
- src++) { | |
+ src = NEXT(ut, src)) { | |
const char *e; | |
ms.level = 0; | |
lua_assert(ms.matchdepth == MAXCCALLS); | |
@@ -748,6 +779,7 @@ | |
int anchor = (*p == '^'); | |
size_t n = 0; | |
MatchState ms; | |
luaL_Buffer b; | |
+ ms.isutf8 = 1; /* TODO: set it conditionally; kept after the declarations (no statement before a declaration) */ | |
luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING || | |
tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3, |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment