Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save leoliu/1900111 to your computer and use it in GitHub Desktop.
Save leoliu/1900111 to your computer and use it in GitHub Desktop.
lookaround regexp extension for Emacs 23.4
From 5aa9e8dfef5a801ad4a9dd9e01a72c4386e3baa3 Mon Sep 17 00:00:00 2001
From: Leo <sdl.web@gmail.com>
Date: Sun, 22 May 2011 14:17:52 +0800
Subject: [PATCH] Merge patch for lookaround regexp extension
Patch extracted from the bzr repo on
http://cx4a.org/hack/emacs-regexp-extension.html and also available on
http://paste.pocoo.org/show/393041.
---
src/regex.c | 324 ++++++++++++++++++++++++++++++++++++++++++++--
test/regexp-testsuite.el | 280 +++++++++++++++++++++++++++++++++++++++
2 files changed, 591 insertions(+), 13 deletions(-)
create mode 100644 test/regexp-testsuite.el
diff --git a/src/regex.c b/src/regex.c
index a3a4d97c..740e99bf 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -736,7 +736,14 @@ typedef enum
syntaxspec,
/* Matches any character whose syntax is not that specified. */
- notsyntaxspec
+ notsyntaxspec,
+
+ lookahead,
+ lookahead_not,
+ lookbehind,
+ lookbehind_not,
+ lookaround_succeed,
+ lookaround_fail
#ifdef emacs
,before_dot, /* Succeeds if before point. */
@@ -1034,6 +1041,36 @@ print_partial_compiled_pattern (start, end)
fprintf (stderr, "/stop_memory/%d", *p++);
break;
+ case lookahead:
+ extract_number_and_incr (&mcnt, &p);
+ fprintf (stderr, "/lookahead/%d", mcnt);
+ break;
+
+ case lookahead_not:
+ extract_number_and_incr (&mcnt, &p);
+ fprintf (stderr, "/lookahead_not/%d", mcnt);
+ break;
+
+ case lookbehind:
+ extract_number_and_incr (&mcnt, &p);
+ extract_number_and_incr (&mcnt2, &p);
+ fprintf (stderr, "/lookbehind/%d/%d", mcnt, mcnt2);
+ break;
+
+ case lookbehind_not:
+ extract_number_and_incr (&mcnt, &p);
+ extract_number_and_incr (&mcnt2, &p);
+ fprintf (stderr, "/lookbehind_not/%d/%d", mcnt, mcnt2);
+ break;
+
+ case lookaround_succeed:
+ fprintf (stderr, "/lookaround_succeed");
+ break;
+
+ case lookaround_fail:
+ fprintf (stderr, "/lookaround_fail");
+ break;
+
case duplicate:
fprintf (stderr, "/duplicate/%d", *p++);
break;
@@ -1601,11 +1638,17 @@ do { \
} \
else \
{ \
- regend[reg] = POP_FAILURE_POINTER (); \
- regstart[reg] = POP_FAILURE_POINTER (); \
- DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
- reg, regstart[reg], regend[reg]); \
- } \
+ re_char *start, *end; \
+ end = POP_FAILURE_POINTER (); \
+ start = POP_FAILURE_POINTER (); \
+ if (!discard_saved_regs) \
+ { \
+ regstart[reg] = start; \
+ regend[reg] = end; \
+ DEBUG_PRINT4 (" Pop reg %d (spanning %p -> %p)\n", \
+ reg, regstart[reg], regend[reg]); \
+ } \
+ } \
} while (0)
/* Check that we are not stuck in an infinite loop. */
@@ -1703,7 +1746,7 @@ do { \
while (fail_stack.frame < fail_stack.avail) \
POP_FAILURE_REG_OR_COUNT (); \
\
- pat = POP_FAILURE_POINTER (); \
+ pat = POP_FAILURE_POINTER (); \
DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
\
@@ -1725,6 +1768,29 @@ do { \
} while (0) /* POP_FAILURE_POINT */
+#define FINISH_LOOKAROUND() \
+ do { /* Unwind the failure stack down to the frame pushed by the nearest lookaround opcode. */ \
+ re_char *str, *pat; \
+ re_opcode_t op; \
+ discard_saved_regs = 1; /* Pop saved registers without restoring them. */ \
+ while (!FAIL_STACK_EMPTY ()) \
+ { \
+ POP_FAILURE_POINT (str, pat); \
+ op = (re_opcode_t) *pat; \
+ if (op == lookahead \
+ || op == lookahead_not \
+ || op == lookbehind \
+ || op == lookbehind_not) \
+ { \
+ d = str; /* Resume at the string position saved with that frame. */ \
+ dend = ((d >= string1 && d <= end1) \
+ ? end_match_1 : end_match_2); \
+ break; \
+ } \
+ } \
+ discard_saved_regs = 0; \
+ } while (0) /* No trailing semicolon: the caller supplies it. */
+
/* Registers are set to a sentinel when they haven't yet matched. */
#define REG_UNSET(e) ((e) == NULL)
@@ -1923,6 +1989,7 @@ typedef struct
pattern_offset_t fixup_alt_jump;
pattern_offset_t laststart_offset;
regnum_t regnum;
+ int lookaround;
} compile_stack_elt_t;
@@ -2523,6 +2590,8 @@ static boolean group_in_compile_stack _RE_ARGS ((compile_stack_type
compile_stack,
regnum_t regnum));
+static int exact_chars_in_pattern_buffer _RE_ARGS ((struct re_pattern_buffer *bufp, re_char *p, re_char *pend));
+
/* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
Returns one of error codes defined in `regex.h', or zero for success.
@@ -3269,6 +3338,7 @@ regex_compile (pattern, size, syntax, bufp)
handle_open:
{
int shy = 0;
+ int lookaround = 0;
regnum_t regnum = 0;
if (p+1 < pend)
{
@@ -3290,6 +3360,27 @@ regex_compile (pattern, size, syntax, bufp)
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
regnum = 10*regnum + (c - '0'); break;
+ case '=':
+ /* Positive lookahead assertion. */
+ shy = lookaround = 1;
+ break;
+ case '!':
+ /* Negative lookahead assertion. */
+ shy = lookaround = 2;
+ break;
+ case '<':
+ {
+ PATFETCH (c);
+ if (c == '=')
+ /* Positive lookbehind assertion. */
+ shy = lookaround = -1;
+ else if (c == '!')
+ /* Negative lookbehind assertion. */
+ shy = lookaround = -2;
+ else
+ FREE_STACK_RETURN (REG_BADPAT);
+ }
+ break;
default:
/* Only (?:...) is supported right now. */
FREE_STACK_RETURN (REG_BADPAT);
@@ -3336,6 +3427,7 @@ regex_compile (pattern, size, syntax, bufp)
= fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
COMPILE_STACK_TOP.regnum = regnum;
+ COMPILE_STACK_TOP.lookaround = lookaround;
/* Do not push a start_memory for groups beyond the last one
we can represent in the compiled pattern. */
@@ -3385,6 +3477,7 @@ regex_compile (pattern, size, syntax, bufp)
later groups should continue to be numbered higher,
as in `(ab)c(de)' -- the second group is #2. */
regnum_t regnum;
+ int lookaround;
compile_stack.avail--;
begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
@@ -3397,13 +3490,40 @@ regex_compile (pattern, size, syntax, bufp)
/* If we've reached MAX_REGNUM groups, then this open
won't actually generate any code, so we'll have to
clear pending_exact explicitly. */
+ lookaround = COMPILE_STACK_TOP.lookaround;
pending_exact = 0;
/* We're at the end of the group, so now we know how many
groups were inside this one. */
if (regnum <= MAX_REGNUM && regnum > 0)
BUF_PUSH_2 (stop_memory, regnum);
- }
+ else if (lookaround)
+ {
+ if (lookaround > 0)
+ {
+ /* Positive/negative lookahead assertion. */
+ GET_BUFFER_SPACE (3);
+ INSERT_JUMP (lookaround == 1 ? lookahead : lookahead_not, laststart, b + 4);
+ b += 3;
+ }
+ else
+ {
+ /* Positive/negative lookbehind assertion. */
+ int count = exact_chars_in_pattern_buffer (bufp, laststart, b);
+ if (count == -1) /* variable length */
+ FREE_STACK_RETURN (REG_BADPAT);
+
+ GET_BUFFER_SPACE (5);
+ INSERT_JUMP2 (lookaround == -1 ? lookbehind : lookbehind_not, laststart, b + 6, count);
+ b += 5;
+ }
+
+ /* Negative form. */
+ if (lookaround > 1 || lookaround < -1)
+ BUF_PUSH (lookaround_fail);
+ BUF_PUSH (lookaround_succeed);
+ }
+ }
break;
@@ -3957,10 +4077,16 @@ at_begline_loc_p (pattern, p, syntax)
/* After an alternative? */
|| (*prev == '|' && (syntax & RE_NO_BK_VBAR || prev_prev_backslash))
/* After a shy subexpression? */
- || ((syntax & RE_SHY_GROUPS) && prev - 2 >= pattern
- && prev[-1] == '?' && prev[-2] == '('
- && (syntax & RE_NO_BK_PARENS
- || (prev - 3 >= pattern && prev[-3] == '\\')));
+ || ((syntax & RE_SHY_GROUPS)
+ && ((prev - 2 >= pattern
+ && prev[-1] == '?' && prev[-2] == '('
+ && (syntax & RE_NO_BK_PARENS
+ || (prev - 3 >= pattern && prev[-3] == '\\')))
+ || (prev - 3 >= pattern
+ && (*prev == '=' || *prev == '!')
+ && prev[-1] == '<' && prev[-2] == '?' && prev[-3] == '('
+ && (syntax & RE_NO_BK_PARENS
+ || (prev - 4 >= pattern && prev[-4] == '\\')))));
}
@@ -4205,6 +4331,13 @@ analyse_first (p, pend, fastmap, multibyte)
}
break;
+ case lookahead:
+ case lookahead_not:
+ case lookbehind:
+ case lookbehind_not:
+ if (!fastmap) break;
+ return -1;
+
/* All cases after this match the empty string. These end with
`continue'. */
@@ -4829,7 +4962,7 @@ skip_noops (p, pend)
{
case start_memory:
case stop_memory:
- p += 2; break;
+ p += 2; break;
case no_op:
p += 1; break;
case jump:
@@ -4845,6 +4978,93 @@ skip_noops (p, pend)
return p;
}
+static int
+exact_chars_in_pattern_buffer (bufp, p, pend)
+ struct re_pattern_buffer *bufp;
+ re_char *p, *pend;
+{ /* Return how many characters the compiled opcodes in [P, PEND) match, or -1 on an opcode not handled below.  BUFP is not referenced. */
+ int count = 0;
+ while (p < pend)
+ {
+ switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
+ {
+ case exactn:
+ {
+ int mcnt = *p++; /* Byte length of the literal that follows. */
+ int buf_charlen;
+ while (mcnt > 0) {
+ STRING_CHAR_AND_LENGTH (p, buf_charlen);
+ p += buf_charlen;
+ mcnt -= buf_charlen;
+ count++; /* One per decoded character, not per byte. */
+ }
+ }
+ break;
+ case start_memory:
+ case stop_memory:
+ p++; /* Skip the one-byte operand; groups add no width. */
+ break;
+#ifdef emacs
+ case categoryspec:
+ case notcategoryspec:
+#endif /* emacs */
+ case syntaxspec:
+ case notsyntaxspec:
+ p++; /* Skip the one-byte operand, then fall through.  */
+ case anychar:
+ count++;
+ break;
+
+ case charset:
+ case charset_not:
+ if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1)) /* P - 1 is the opcode byte. */
+ {
+ int mcnt;
+ p = CHARSET_RANGE_TABLE (p - 1);
+ EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ p = CHARSET_RANGE_TABLE_END (p, mcnt);
+ }
+ else
+ p += 1 + CHARSET_BITMAP_SIZE (p - 1);
+ count++; /* A charset matches exactly one character. */
+ break;
+
+#ifdef emacs
+ case before_dot:
+ case at_dot:
+ case after_dot:
+#endif /* emacs */
+ case no_op:
+ case begline:
+ case endline:
+ case begbuf:
+ case endbuf:
+ case wordbound:
+ case notwordbound:
+ case wordbeg:
+ case wordend:
+ case symbeg:
+ case symend:
+ /* Zero width. */
+ continue;
+ case lookahead:
+ case lookahead_not:
+ case lookbehind:
+ case lookbehind_not:
+ /* Skip to lookaround_succeed.  NOTE(review): this compares raw bytes, so an operand byte equal to that opcode would stop the skip early -- confirm.  */
+ while (p < pend)
+ {
+ if ((re_opcode_t) *p++ == lookaround_succeed)
+ break;
+ }
+ break;
+ default:
+ return -1; /* Any other opcode: no width is computed for it here. */
+ }
+ }
+ return count;
+}
+
/* Non-zero if "p1 matches something" implies "p2 fails". */
static int
mutually_exclusive_p (bufp, p1, p2)
@@ -5202,6 +5422,9 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
re_char **best_regstart, **best_regend;
#endif
+ /* When non-zero, registers popped from the failure stack are discarded instead of being restored. */
+ boolean discard_saved_regs = 0;
+
/* Logically, this is `best_regend[0]'. But we don't want to have to
allocate space for that if we're not allocating space for anything
else (see below). Also, we never need info about register 0 for
@@ -5774,6 +5997,77 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
p += 1;
break;
+ case lookahead:
+ case lookahead_not:
+ DEBUG_PRINT1 ((re_opcode_t) *(p - 1) == lookahead ? "EXECUTING lookahead.\n" : "EXECUTING lookahead_not.\n");
+
+ p += 2;
+ PUSH_FAILURE_POINT (p - 3, d);
+ break;
+
+ case lookbehind:
+ case lookbehind_not:
+ {
+ int mcnt, count;
+ boolean not = (re_opcode_t) *(p - 1) != lookbehind;
+
+ EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ EXTRACT_NUMBER_AND_INCR (count, p);
+
+ DEBUG_PRINT2 (not
+ ? "EXECUTING lookbehind_not %d.\n"
+ : "EXECUTING lookbehind %d.\n", count);
+
+ dfail = d;
+ while (d != string1 && count > 0)
+ {
+ if (d == string2)
+ {
+ if (!string1)
+ break;
+ d = end1;
+ dend = end_match_1;
+ }
+
+ if (target_multibyte)
+ {
+ re_char *dhead = (d >= string1 && d <= end1) ? string1 : string2;
+ PREV_CHAR_BOUNDARY (d, dhead);
+ }
+ else
+ d--;
+ count--;
+ }
+
+ if (count > 0)
+ {
+ if (not)
+ {
+ /* There is not enough text before this position to match,
+ so the negative lookbehind succeeds here. */
+ d = dfail;
+ p = p - 2 + mcnt;
+ break;
+ }
+ else
+ goto fail;
+ }
+
+ PUSH_FAILURE_POINT (p - 5, dfail);
+ }
+ break;
+
+ case lookaround_succeed:
+ DEBUG_PRINT1 ("EXECUTING lookaround_succeed.\n");
+
+ FINISH_LOOKAROUND();
+ break;
+
+ case lookaround_fail:
+ DEBUG_PRINT1 ("EXECUTING lookaround_fail.\n");
+
+ FINISH_LOOKAROUND();
+ goto fail;
/* \<digit> has been turned into a `duplicate' command which is
followed by the numeric value of <digit> as the register number. */
@@ -6415,12 +6709,16 @@ re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
case on_failure_jump_loop:
case on_failure_jump:
case succeed_n:
+ case lookahead_not:
+ case lookbehind_not:
d = str;
continue_failure_jump:
EXTRACT_NUMBER_AND_INCR (mcnt, pat);
p = pat + mcnt;
break;
+ case lookahead:
+ case lookbehind:
case no_op:
/* A special frame used for nastyloops. */
goto fail;
diff --git a/test/regexp-testsuite.el b/test/regexp-testsuite.el
new file mode 100644
index 00000000..c79d6fdb
--- /dev/null
+++ b/test/regexp-testsuite.el
@@ -0,0 +1,280 @@
+;; -*-coding:utf-8-*-
+
+(require 'cl)
+
+(defvar regexp-testsuite-success nil)
+
+(defmacro regexp-testsuite-test (name &rest form) ; Run FORM as test NAME and print "NAME ... ok/fail/invalid".
+ (declare (indent 1))
+ `(funcall (if noninteractive ; checked when the test runs, not when the macro expands
+ 'princ-list
+ (lambda (line) (message "%s" line))) ; never use the report itself as a format string
+ (format "%s ... %s"
+ ,name
+ (condition-case nil
+ (if (progn ,@form)
+ 'ok
+ (setq regexp-testsuite-success nil) ; a false result fails the whole run
+ 'fail)
+ (error (progn
+ (setq regexp-testsuite-success nil) ; so does an error signalled by FORM
+ 'invalid))))))
+
+(defun regexp-testsuite-expect-invalid (regexp) ; Pass only when matching with REGEXP signals an error.
+ (regexp-testsuite-test (format "expect-invalid %S" regexp)
+ (condition-case nil
+ (progn (string-match regexp "") nil) ; no error signalled, so the test fails
+ (error t))))
+
+(defun regexp-testsuite-expect-match (regexp string &optional group-number group-string) ; Pass when REGEXP matches STRING.
+ (regexp-testsuite-test (format "expect-match %S %S" regexp string)
+ (when (string-match regexp string)
+ (or (null group-number) ; no group requested: the match alone is enough
+ (equal (match-string group-number string)
+ group-string)))))
+
+(defun regexp-testsuite-expect-not-match (regexp string) ; Pass when REGEXP does not match STRING.
+ (regexp-testsuite-test (format "expect-not-match %S %S" regexp string)
+ (null (string-match regexp string))))
+
+(defun regexp-testsuite-run () ; Run every lookaround test, then report overall success or failure.
+ (interactive)
+ (setq regexp-testsuite-success t) ; cleared by any failing or erroring test
+ (regexp-testsuite-expect-match "\\(?=\\)" "") ; positive lookahead
+ (regexp-testsuite-expect-not-match "\\(?=a\\)" "")
+ (regexp-testsuite-expect-match "a\\(?=b\\)b" "ab")
+ (regexp-testsuite-expect-not-match "a\\(?=b\\)c" "ab")
+ (regexp-testsuite-expect-match "\\(?=a\\)a" "a")
+ (regexp-testsuite-expect-not-match "\\(?=b\\)a" "a")
+ (regexp-testsuite-expect-match "\\(?=^\\)a" "a")
+ (regexp-testsuite-expect-match "a\\(?=$\\)$" "a")
+ (regexp-testsuite-expect-match "a\\(?=\\)$" "a")
+ (regexp-testsuite-expect-match "a\\(?=.*c\\)b" "abc")
+ (regexp-testsuite-expect-not-match "a\\(?=.*d\\)b" "abc")
+ (regexp-testsuite-expect-match "a\\(?=b\\|c\\|d\\|e\\)" "ae")
+ (regexp-testsuite-expect-not-match "a\\(?=b\\|c\\|d\\|e\\)" "af")
+ (regexp-testsuite-expect-match "a\\(?=\\(b\\)\\)b" "ab" 1 "b")
+ (regexp-testsuite-expect-match "a\\(\\(?=b\\)\\)" "ab" 1 "")
+ (regexp-testsuite-expect-match "a\\(?=\\(b\\)\\)" "ab" 1 "b")
+ (regexp-testsuite-expect-match "\\(a\\(?=\\(b\\)\\)\\2\\)\\1" "abab" 1 "ab")
+ (regexp-testsuite-expect-not-match "\\(a\\)\\(?=\\(b\\)\\)\\1" "ab")
+ (regexp-testsuite-expect-match "\\(a\\(?=b\\(?=c\\)\\)\\)" "abc" 1 "a")
+ (regexp-testsuite-expect-not-match "\\(a\\(?=b\\(?=c\\)\\)\\)" "abd")
+ (regexp-testsuite-expect-not-match "\\(?!\\)" "") ; negative lookahead
+ (regexp-testsuite-expect-match "\\(?!a\\)" "")
+ (regexp-testsuite-expect-not-match "a\\(?!b\\)b" "ab")
+ (regexp-testsuite-expect-match "a\\(?!b\\)c" "ac")
+ (regexp-testsuite-expect-not-match "\\(?!a\\)a" "a")
+ (regexp-testsuite-expect-match "\\(?!b\\)a" "a")
+ (regexp-testsuite-expect-match "\\(?!^\\)a" "ba")
+ (regexp-testsuite-expect-not-match "\\(?!^\\)a" "a")
+ (regexp-testsuite-expect-not-match "a\\(?!$\\)$" "a")
+ (regexp-testsuite-expect-not-match "a\\(?!\\)$" "a")
+ (regexp-testsuite-expect-not-match "a\\(?!.*c\\)b" "abc")
+ (regexp-testsuite-expect-match "a\\(?!.*d\\)b" "abc")
+ (regexp-testsuite-expect-not-match "a\\(?!b\\|c\\|d\\|e\\)" "ae")
+ (regexp-testsuite-expect-match "a\\(?!b\\|c\\|d\\|e\\)" "af")
+ (regexp-testsuite-expect-match "a\\(?!\\(b\\)\\)c" "ac")
+ (regexp-testsuite-expect-match "a\\(\\(?!b\\)\\)" "ac")
+ (regexp-testsuite-expect-match "a\\(?!b\\(?!c\\)\\)" "abc")
+ (regexp-testsuite-expect-not-match "a\\(?!b\\(?=\\(c\\)\\)\\)" "abc")
+ (regexp-testsuite-expect-not-match "a\\(?!b\\(?!c\\)\\)" "abd")
+ (regexp-testsuite-expect-match "\\(?<=\\)" "") ; positive lookbehind
+ (regexp-testsuite-expect-not-match "\\(?<=a\\)" "")
+ (regexp-testsuite-expect-match "\\(?<=a\\)" "a")
+ (regexp-testsuite-expect-not-match "\\(?<=b\\)" "a")
+ (regexp-testsuite-expect-match "\\(?<=^\\)" "")
+ (regexp-testsuite-expect-not-match "a\\(?<=^\\)" "")
+ (regexp-testsuite-expect-match "\\(?<=$\\)" "")
+ (regexp-testsuite-expect-not-match "\\(?<=$\\)a" "")
+ (regexp-testsuite-expect-match "\\(?<=a\\)b" "ab")
+ (regexp-testsuite-expect-not-match "\\(?<=c\\)b" "ab")
+ (regexp-testsuite-expect-match "\\(?<=\\(?<=a\\)\\)b" "ab")
+ (regexp-testsuite-expect-not-match "\\(?<=\\(?<=b\\)\\)b" "ab")
+ (regexp-testsuite-expect-match "\\(?<=\\(?=a\\).\\)b" "ab")
+ (regexp-testsuite-expect-match "\\(?<=\\(a\\)\\)b\\1" "aba" 1 "a")
+ (regexp-testsuite-expect-match "\\(?<=.\\)a" "aa")
+ (regexp-testsuite-expect-match "\\(?<=\\(.\\)\\)a" "aa")
+ (regexp-testsuite-expect-match "\\(?<=\\w\\)a" "aa")
+ (regexp-testsuite-expect-not-match "\\(?<=\\w\\)a" "!a")
+ (regexp-testsuite-expect-match "\\(?<=\\sw\\)a" "aa")
+ (regexp-testsuite-expect-not-match "\\(?<=\\sw\\)a" "!a")
+ (regexp-testsuite-expect-match "\\(?<=\\cg\\)a" "λa")
+ (regexp-testsuite-expect-not-match "\\(?<=\\Cg\\)a" "λa")
+ (regexp-testsuite-expect-match "\\(?<=[a-z]\\)" "aa")
+ (regexp-testsuite-expect-not-match "\\(?<=[a-z]\\)a" "1a")
+ (regexp-testsuite-expect-match "\\(?<=[^a-z]\\)" "1a")
+ (regexp-testsuite-expect-not-match "\\(?<=[^a-z]\\)" "aa")
+ (regexp-testsuite-expect-match "\\(?<=[:ascii:]\\)a" "aa")
+ (regexp-testsuite-expect-match "\\(?<=\\`\\)" "")
+ (regexp-testsuite-expect-not-match "a\\(?<=\\`\\)" "a")
+ (regexp-testsuite-expect-match "\\(?<=\\'\\)" "")
+ (regexp-testsuite-expect-not-match "\\(?<=\\'\\)a" "a")
+ (regexp-testsuite-expect-not-match "\\(?<=\\=\\)" "")
+ (regexp-testsuite-expect-match "\\(?<=\\b\\)a" "a")
+ (regexp-testsuite-expect-not-match "a\\(?<=\\b\\)b" "ab")
+ (regexp-testsuite-expect-match "\\(?<=\\B\\)a" "aa")
+ (regexp-testsuite-expect-not-match "\\(?<=\\B\\)a" " a")
+ (regexp-testsuite-expect-match "\\(?<=\\<\\)a" "a")
+ (regexp-testsuite-expect-not-match "a\\(?<=\\<\\)b" "ab")
+ (regexp-testsuite-expect-match "a\\(?<=\\>\\)" "a")
+ (regexp-testsuite-expect-not-match "a\\(?<=\\>\\)b" "ab")
+ (regexp-testsuite-expect-match "\\(?<=\\_<\\)a" "a")
+ (regexp-testsuite-expect-not-match "a\\(?<=\\_<\\)b" "ab")
+ (regexp-testsuite-expect-match "a\\(?<=\\_>\\)" "a")
+ (regexp-testsuite-expect-not-match "a\\(?<=\\_>\\)b" "ab")
+ (regexp-testsuite-expect-invalid "\\(?<=\\(.\\)\\1\\)") ; duplicate
+ (regexp-testsuite-expect-invalid "\\(?<=a*\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<=a*?\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<=a+\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<=a+?\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<=a?\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<=a??\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<=a\\{1,4\\}\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<=a\\|bb\\|ccc\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<=a\\{4\\}\\)") ; fixed width but not supported yet
+ (regexp-testsuite-expect-invalid "\\(?<=a\\|\\b\\c\\)") ; fixed width but not supported yet
+ (regexp-testsuite-expect-not-match "\\(?<!\\)" "") ; negative lookbehind
+ (regexp-testsuite-expect-match "\\(?<!a\\)" "")
+ (regexp-testsuite-expect-match "\\(?<!a\\)" "a")
+ (regexp-testsuite-expect-not-match "\\(?<!a\\)b" "ab")
+ (regexp-testsuite-expect-match "\\(?<!b\\)" "a")
+ (regexp-testsuite-expect-not-match "\\(?<!^\\)" "")
+ (regexp-testsuite-expect-not-match "a\\(?<!^\\)" "")
+ (regexp-testsuite-expect-not-match "\\(?<!$\\)" "")
+ (regexp-testsuite-expect-match "\\(?<=a\\)b" "ab")
+ (regexp-testsuite-expect-match "\\(?<!c\\)b" "ab")
+ (regexp-testsuite-expect-match "\\(?<!\\(?<!a\\)\\)b" "ab")
+ (regexp-testsuite-expect-not-match "\\(?<!\\(?<!b\\)\\)b" "ab")
+ (regexp-testsuite-expect-match "\\(?<!\\(?!a\\).\\)b" "ab")
+ (regexp-testsuite-expect-match "\\(?<!.\\)a" "aa")
+ (regexp-testsuite-expect-not-match "\\(?<!.\\)b" "ab")
+ (regexp-testsuite-expect-not-match "\\(?<!\\(.\\)\\)b" "ab")
+ (regexp-testsuite-expect-not-match "\\(?<!\\w\\)b" "ab")
+ (regexp-testsuite-expect-not-match "\\(?<!\\w\\)b" "ab")
+ (regexp-testsuite-expect-not-match "\\(?<!\\sw\\)b" "ab")
+ (regexp-testsuite-expect-match "\\(?<!\\sw\\)a" "!a")
+ (regexp-testsuite-expect-not-match "\\(?<!\\cg\\)a" "λa")
+ (regexp-testsuite-expect-match "\\(?<!\\Cg\\)a" "λa")
+ (regexp-testsuite-expect-match "\\(?<![a-z]\\)" "aa")
+ (regexp-testsuite-expect-match "\\(?<![a-z]\\)a" "1a")
+ (regexp-testsuite-expect-not-match "\\(?<![^a-z]\\)a" "1a")
+ (regexp-testsuite-expect-not-match "\\(?<![:ascii:]\\)b" "ab")
+ (regexp-testsuite-expect-not-match "\\(?<!\\`\\)" "")
+ (regexp-testsuite-expect-match "a\\(?<!\\`\\)" "a")
+ (regexp-testsuite-expect-not-match "\\(?<!\\'\\)" "")
+ (regexp-testsuite-expect-match "\\(?<!\\'\\)a" "a")
+ (regexp-testsuite-expect-match "\\(?<!\\=\\)" "")
+ (regexp-testsuite-expect-not-match "\\(?<!\\b\\)a" "a")
+ (regexp-testsuite-expect-match "a\\(?<!\\b\\)b" "ab")
+ (regexp-testsuite-expect-not-match "\\(?<!\\B\\)b" "ab")
+ (regexp-testsuite-expect-match "\\(?<!\\B\\)a" " a")
+ (regexp-testsuite-expect-not-match "\\(?<!\\<\\)a" "a")
+ (regexp-testsuite-expect-match "a\\(?<!\\<\\)b" "ab")
+ (regexp-testsuite-expect-not-match "a\\(?<!\\>\\)" "a")
+ (regexp-testsuite-expect-match "a\\(?<!\\>\\)b" "ab")
+ (regexp-testsuite-expect-not-match "\\(?<!\\_<\\)a" "a")
+ (regexp-testsuite-expect-match "a\\(?<!\\_<\\)b" "ab")
+ (regexp-testsuite-expect-not-match "a\\(?<!\\_>\\)" "a")
+ (regexp-testsuite-expect-match "a\\(?<!\\_>\\)b" "ab")
+ (regexp-testsuite-expect-invalid "\\(?<!\\(.\\)\\1\\)") ; duplicate
+ (regexp-testsuite-expect-invalid "\\(?<!a*\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<!a*?\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<!a+\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<!a+?\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<!a?\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<!a??\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<!a\\{1,4\\}\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<!a\\|bb\\|ccc\\)") ; variable width
+ (regexp-testsuite-expect-invalid "\\(?<!a\\{4\\}\\)") ; fixed width but not supported yet
+ (regexp-testsuite-expect-invalid "\\(?<!a\\|\\b\\c\\)") ; fixed width but not supported yet
+
+ (regexp-testsuite-expect-match "Hello, \\(?=世界\\)" "Hello, 世界!") ; multibyte text
+ (regexp-testsuite-expect-not-match "Hello, \\(?=せかい\\)" "Hello, 世界!")
+ (regexp-testsuite-expect-match "Hello, \\(?!せかい\\)" "Hello, 世界!")
+ (regexp-testsuite-expect-not-match "Hello, \\(?!世界\\)" "Hello, 世界!")
+ (regexp-testsuite-expect-match "\\(?<=こんにちは\\), World!" "こんにちは, World!")
+ (regexp-testsuite-expect-not-match "\\(?<=こんにちわ\\), World!" "こんにちは, World!")
+ (regexp-testsuite-expect-match "\\(?<!こんにちわ\\), World!" "こんにちは, World!")
+ (regexp-testsuite-expect-not-match "\\(?<!こんにちは\\), World!" "こんにちは, World!")
+
+ (with-temp-buffer
+ (insert "abracadabra")
+ (goto-char (point-min))
+ (regexp-testsuite-test "re-search-forward lookahead"
+ (equal
+ (loop while (re-search-forward "a\\(?=b\\)" nil t)
+ collect (point))
+ '(2 9))))
+
+ (with-temp-buffer
+ (insert "abracadabra")
+ (regexp-testsuite-test "re-search-backward lookahead"
+ (equal
+ (loop while (re-search-backward "a\\(?=b\\)" nil t)
+ collect (point))
+ '(8 1))))
+
+ (with-temp-buffer
+ (insert "abracadabra")
+ (goto-char (point-min))
+ (regexp-testsuite-test "re-search-forward lookbehind"
+ (equal
+ (loop while (re-search-forward "\\(?<=a\\)b" nil t)
+ collect (point))
+ '(3 10))))
+
+ (with-temp-buffer
+ (insert "abracadabra")
+ (regexp-testsuite-test "re-search-backward lookbehind"
+ (equal
+ (loop while (re-search-backward "\\(?<=a\\)b" nil t)
+ collect (point))
+ '(9 2))))
+
+ (with-temp-buffer
+ (insert "abcdebc")
+ (goto-char 3) ; start just after the "b" that the lookbehind must see
+ (regexp-testsuite-test "re-search-forward lookbehind 2"
+ (eq (re-search-forward "\\(?<=b\\)c" nil t) 4)))
+
+ (with-temp-buffer
+ (insert "abcdebc")
+ (goto-char 7)
+ ;; search-backward with lookahead over bound is not supported yet
+ (regexp-testsuite-test "re-search-backward not supported"
+ (eq (re-search-backward "b\\(?=c\\)" nil t) 2)))
+
+ (if regexp-testsuite-success
+ (message "Test success!")
+ (message "Test failed.")))
+
+(defun regexp-testsuite-benchmark (file) ; Time plain-group vs. lookaround regexps over FILE, searching forward and backward.
+ (interactive (list (read-file-name "Large file: "
+ nil
+ (progn
+ (require 'find-func)
+ (let ((file (concat (or find-function-C-source-directory "~/src/emacs") "/src/xdisp.c")))
+ (if (file-exists-p file) ; offer xdisp.c as the default only if it exists
+ file))))))
+ (require 'benchmark)
+ (let (count)
+ (with-temp-buffer
+ (insert-file-contents file)
+ (dolist (pair '((point-min . re-search-forward) (point-max . re-search-backward)))
+ (dolist (regexp '("unsigned \\(?:char\\|int\\|long\\)" "unsigned \\(?=char\\|int\\|long\\)"
+ "\\(?:unsigned \\)int" "\\(?<=unsigned \\)int"))
+ ;; COUNT is reset inside every repetition below, so it reports the matches of one pass.
+ (funcall (if noninteractive
+ 'princ-list
+ (lambda (line) (message "%s" line))) ; never use the report itself as a format string
+ (format "%s: %s elapsed (%s found)"
+ regexp
+ (car (benchmark-run 10 ; elapsed time covers all 10 repetitions
+ (progn (setq count 0)
+ (goto-char (funcall (car pair)))
+ (while (funcall (cdr pair) regexp nil t)
+ (setq count (1+ count))))))
+ count)))))))
+
+(provide 'regexp-testsuite)
--
1.7.8
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment