Skip to content

Instantly share code, notes, and snippets.

@shirok
Created October 22, 2011 09:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shirok/1305809 to your computer and use it in GitHub Desktop.
Save shirok/1305809 to your computer and use it in GitHub Desktop.
diff --git a/src/gauche/regexp.h b/src/gauche/regexp.h
index 298b86a..2c1a348 100644
--- a/src/gauche/regexp.h
+++ b/src/gauche/regexp.h
@@ -49,6 +49,11 @@ struct ScmRegexpRec {
int numSets; /* # of charsets in sets */
int flags; /* internal; CASE_FOLD, BOL_ANCHORED etc. */
ScmString *mustMatch;
+ ScmObj laset; /* lookahead set (char-set) or #f.
+ If not #f, it represents the condition that can
+ match at the beginning of the regexp. It can be
+ used to skip input start position when regexp
+ isn't BOL_ANCHORED. */
};
struct ScmRegMatchRec {
diff --git a/src/regexp.c b/src/regexp.c
index 01721d1..8e6074e 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -161,7 +161,11 @@ enum {
#define MAX_LIMITED_REPEAT 255
/* internal regexp flag. */
-#define SCM_REGEXP_BOL_ANCHORED (1L<<2)
+#define SCM_REGEXP_BOL_ANCHORED (1L<<2) /* The regexp beginning is anchored
+ by ^.*/
+#define SCM_REGEXP_SIMPLE_PREFIX (1L<<3) /* The regexp begins with a repeating
+ character or charset, e.g. #/a+b/.
+ See is_simple_prefixed() below. */
/* AST - the first pass of regexp compiler creates intermediate AST.
* Alternatively, you can provide AST directly to the regexp compiler,
@@ -1696,8 +1700,12 @@ static int is_bol_anchored(ScmObj ast)
else return FALSE;
}
type = SCM_CAR(ast);
- if (SCM_INTP(type) || SCM_EQ(type, SCM_SYM_SEQ)
- || SCM_EQ(type, SCM_SYM_SEQ_UNCASE) || SCM_EQ(type, SCM_SYM_SEQ_CASE)) {
+ if (SCM_INTP(type)) {
+ if (!SCM_PAIRP(SCM_CDDR(ast))) return FALSE;
+ return is_bol_anchored(SCM_CAR(SCM_CDDR(ast)));
+ } else if (SCM_EQ(type, SCM_SYM_SEQ)
+ || SCM_EQ(type, SCM_SYM_SEQ_UNCASE)
+ || SCM_EQ(type, SCM_SYM_SEQ_CASE)) {
if (!SCM_PAIRP(SCM_CDR(ast))) return FALSE;
return is_bol_anchored(SCM_CADR(ast));
}
@@ -1711,11 +1719,133 @@ static int is_bol_anchored(ScmObj ast)
return FALSE;
}
+/* Aux function for is_simple_prefixed.
+ Returns TRUE if AST is <char>, <char-set>, or (comp . <char-set>)*/
+static int is_char_or_charset(ScmObj ast)
+{
+ if (SCM_CHARP(ast) || SCM_CHAR_SET_P(ast)
+ || (SCM_PAIRP(ast)
+ && SCM_EQ(SCM_CAR(ast), SCM_SYM_COMP)
+ && SCM_CHAR_SET_P(SCM_CDR(ast)))) {
+ return TRUE;
+ } else {
+ return FALSE;
+ }
+}
+
+/* Returns TRUE iff ast has a form #/A+B/ where A is a char or charset,
+ and B begins with distinct charset from A (B may be empty).
+ After optimization, the AST begins with (rep-while 1 #f A).
+ If so, we can greatly optimize the failure case.
+ Suppose if we try input s against #/A+B/ and find it fail. Then
+ we can skip prefix of s as far as it matches #/A/. */
+static int is_simple_prefixed(ScmObj ast)
+{
+ ScmObj car;
+
+ if (!SCM_PAIRP(ast)) return FALSE;
+ car = SCM_CAR(ast);
+ if (SCM_EQ(car, SCM_SYM_REP_WHILE)) {
+ if (SCM_EQ(SCM_CADR(ast), SCM_MAKE_INT(1))
+ && SCM_FALSEP(SCM_CAR(SCM_CDDR(ast)))) {
+ ScmObj body = SCM_CDR(SCM_CDDR(ast));
+ if (SCM_PAIRP(body) && SCM_NULLP(SCM_CDR(body))) {
+ return is_char_or_charset(SCM_CAR(body));
+ }
+ }
+ return FALSE;
+ } else if (SCM_EQ(car, SCM_SYM_SEQ)) { /* TODO: handle uncase */
+ if (SCM_PAIRP(SCM_CDR(ast))) {
+ return is_simple_prefixed(SCM_CADR(ast));
+ }
+ return FALSE;
+ } else if (SCM_INTP(car)) {
+ ScmObj s = SCM_CDDR(ast);
+ if (SCM_PAIRP(s)) return is_simple_prefixed(SCM_CAR(s));
+ }
+ return FALSE;
+}
+
+
+/* returns lookahead set. modifies the first arg. */
+static ScmObj merge_laset(ScmObj la1, ScmObj la2)
+{
+ if (SCM_CHAR_SET_P(la1) && SCM_CHAR_SET_P(la2)) {
+ return Scm_CharSetAdd(SCM_CHAR_SET(la1),
+ SCM_CHAR_SET(la2));
+ } else {
+ return SCM_FALSE;
+ }
+}
+
+static ScmObj calculate_lasetn(ScmObj ast);
+
+/* returns lookahead set. returned charset is fresh.
+ TODO: We can also take advantage of wb and nwb condition to
+ skip the input. */
+static ScmObj calculate_laset(ScmObj head, ScmObj rest)
+{
+ ScmObj head_car, cs;
+
+ if (!SCM_PAIRP(head)) {
+ if (SCM_CHARP(head)) {
+ return Scm_CharSetAddRange(SCM_CHAR_SET(Scm_MakeEmptyCharSet()),
+ SCM_CHAR_VALUE(head),
+ SCM_CHAR_VALUE(head));
+ } else if (SCM_CHAR_SET_P(head)) {
+ return Scm_CharSetCopy(SCM_CHAR_SET(head));
+ }
+ return SCM_FALSE;
+ }
+ head_car = SCM_CAR(head);
+
+ if (SCM_EQ(head_car, SCM_SYM_COMP)) {
+ SCM_ASSERT(SCM_CHAR_SET_P(SCM_CDR(head)));
+ cs = Scm_CharSetCopy(SCM_CHAR_SET(SCM_CDR(head)));
+ return Scm_CharSetComplement(SCM_CHAR_SET(cs));
+ } else if (SCM_EQ(head_car, SCM_SYM_SEQ)||SCM_EQ(head_car, SCM_SYM_ONCE)) {
+ return calculate_lasetn(SCM_CDR(head));
+ } else if (SCM_EQ(head_car, SCM_SYM_ALT)) {
+ ScmObj choices = SCM_CDR(head), r;
+ if (!SCM_PAIRP(choices)) return SCM_FALSE;
+ r = calculate_laset(SCM_CAR(choices), SCM_NIL);
+ choices = SCM_CDR(choices);
+ while (!SCM_FALSEP(r) && SCM_PAIRP(choices)) {
+ r = merge_laset(r, calculate_laset(SCM_CAR(choices), SCM_NIL));
+ choices = SCM_CDR(choices);
+ }
+ return r;
+ } else if (SCM_EQ(head_car, SCM_SYM_REP)
+ || SCM_EQ(head_car, SCM_SYM_REP_WHILE)
+ || SCM_EQ(head_car, SCM_SYM_REP_MIN)) {
+ SCM_ASSERT(SCM_PAIRP(SCM_CDR(head)) && SCM_PAIRP(SCM_CDDR(head)));
+ if (SCM_EQ(SCM_CADR(head), SCM_MAKE_INT(0))) {
+ return merge_laset(calculate_lasetn(SCM_CDR(SCM_CDDR(head))),
+ calculate_lasetn(rest));
+ } else {
+ return calculate_lasetn(SCM_CDR(SCM_CDDR(head)));
+ }
+ } else if (SCM_INTP(head_car)) {
+ SCM_ASSERT(SCM_PAIRP(SCM_CDR(head)));
+ return calculate_lasetn(SCM_CDDR(head));
+ } else {
+ return SCM_FALSE;
+ }
+}
+
+static ScmObj calculate_lasetn(ScmObj ast)
+{
+ if (!SCM_PAIRP(ast)) return SCM_FALSE;
+ else return calculate_laset(SCM_CAR(ast), SCM_CDR(ast));
+}
+
/* pass 3 */
static ScmObj rc3(regcomp_ctx *ctx, ScmObj ast)
{
- /* check if ast is bol-anchored */
+ /* set flags and laset */
if (is_bol_anchored(ast)) ctx->rx->flags |= SCM_REGEXP_BOL_ANCHORED;
+ else if (is_simple_prefixed(ast)) ctx->rx->flags |= SCM_REGEXP_SIMPLE_PREFIX;
+ ctx->rx->laset = calculate_laset(ast, SCM_NIL);
/* pass 3-1 : count # of insns */
ctx->codemax = 1;
@@ -1740,7 +1870,13 @@ void Scm_RegDump(ScmRegexp *rx)
{
int end = rx->numCodes, codep;
- Scm_Printf(SCM_CUROUT, "Regexp %p: (flags=%08x)\n", rx, rx->flags);
+ Scm_Printf(SCM_CUROUT, "Regexp %p: (flags=%08x", rx, rx->flags);
+ if (rx->flags&SCM_REGEXP_BOL_ANCHORED)
+ Scm_Printf(SCM_CUROUT, ",BOL_ANCHORED");
+ if (rx->flags&SCM_REGEXP_SIMPLE_PREFIX)
+ Scm_Printf(SCM_CUROUT, ",SIMPLE_PREFIX");
+ Scm_Printf(SCM_CUROUT, ")\n");
+ Scm_Printf(SCM_CUROUT, " laset = %S\n", rx->laset);
Scm_Printf(SCM_CUROUT, " must = ");
if (rx->mustMatch) {
Scm_Printf(SCM_CUROUT, "%S\n", rx->mustMatch);
@@ -2600,6 +2736,24 @@ static ScmObj rex(ScmRegexp *rx, ScmString *orig,
return make_match(rx, orig, &ctx);
}
+/* advance start pointer while the character matches (skip_match=TRUE) or does
+ not match (skip_match=FALSE), until start pointer hits limit. */
+static inline const char *skip_input(const char *start, const char *limit,
+ ScmObj laset, int skip_match)
+{
+ while (start <= limit) {
+ ScmChar ch;
+ SCM_CHAR_GET(start, ch);
+ if (Scm_CharSetContains(SCM_CHAR_SET(laset), ch)) {
+ if (!skip_match) return start;
+ } else {
+ if (skip_match) return start;
+ }
+ start += SCM_CHAR_NFOLLOWS(*start)+1;
+ }
+ return limit;
+}
+
/*----------------------------------------------------------------------
* entry point
*/
@@ -2610,6 +2764,7 @@ ScmObj Scm_RegExec(ScmRegexp *rx, ScmString *str)
const char *end = start + SCM_STRING_BODY_SIZE(b);
const ScmStringBody *mb = rx->mustMatch? SCM_STRING_BODY(rx->mustMatch) : NULL;
int mustMatchLen = mb? SCM_STRING_BODY_SIZE(mb) : 0;
+ const char *start_limit = end - mustMatchLen;
if (SCM_STRING_INCOMPLETE_P(str)) {
Scm_Error("incomplete string is not allowed: %S", str);
@@ -2633,8 +2788,33 @@ ScmObj Scm_RegExec(ScmRegexp *rx, ScmString *str)
if (rx->flags & SCM_REGEXP_BOL_ANCHORED) {
return rex(rx, str, start, end);
}
+
+ /* if we have lookahead-set, we may be able to skip input efficiently. */
+ if (!SCM_FALSEP(rx->laset)) {
+ ScmObj r;
+ const char *next;
+
+ if (rx->flags & SCM_REGEXP_SIMPLE_PREFIX) {
+ while (start <= start_limit) {
+ r = rex(rx, str, start, end);
+ if (!SCM_FALSEP(r)) return r;
+ next = skip_input(start, start_limit, rx->laset, TRUE);
+ if (start != next) start = next;
+ else start = next + SCM_CHAR_NFOLLOWS(*start) + 1;
+ }
+ } else {
+ while (start <= start_limit) {
+ start = skip_input(start, start_limit, rx->laset, FALSE);
+ r = rex(rx, str, start, end);
+ if (!SCM_FALSEP(r)) return r;
+ start += SCM_CHAR_NFOLLOWS(*start)+1;
+ }
+ }
+ return SCM_FALSE;
+ }
+
/* normal matching */
- while (start <= end-mustMatchLen) {
+ while (start <= start_limit) {
ScmObj r = rex(rx, str, start, end);
if (!SCM_FALSEP(r)) return r;
start += SCM_CHAR_NFOLLOWS(*start)+1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment