Skip to content

Instantly share code, notes, and snippets.

@snippins
Last active August 7, 2022 10:49
Show Gist options
  • Save snippins/2d9a66e6708bc258f662babe0a1c1009 to your computer and use it in GitHub Desktop.
Save snippins/2d9a66e6708bc258f662babe0a1c1009 to your computer and use it in GitHub Desktop.
Emacs 29 patch for regex lookaround - rev b9c65203d0f419306ac062e59a59643db9a1a541 - 2022-08-06 master
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index 9b2c14c..015451d 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -43,6 +43,8 @@
# undef RE_DUP_MAX
#endif
#define RE_DUP_MAX (0xffff)
+#define PREV_CHAR_BOUNDARY(p, limit) ((p)--)
+#define STRING_CHAR_AND_LENGTH(p, actual_len) ((actual_len) = 1, *(p))
/* Make syntax table lookup grant data in gl_state. */
#define SYNTAX(c) syntax_property (c, 1)
@@ -321,7 +323,16 @@ #define BYTEWIDTH 8 /* In bits. */
/* Matches any character whose syntax is not that specified. */
notsyntaxspec,
+ lookahead,
+ lookahead_not,
+ lookbehind,
+ lookbehind_not,
+ lookaround_succeed,
+ lookaround_fail,
+
+ before_dot, /* Succeeds if before point. */
at_dot, /* Succeeds if at point. */
+ after_dot, /* Succeeds if after point. */
/* Matches any character whose category-set contains the specified
category. The operator is followed by a byte which contains a
@@ -542,6 +553,32 @@ print_partial_compiled_pattern (re_char *start, re_char *end)
fprintf (stderr, "/stop_memory/%d", *p++);
break;
+ case lookahead:
+ extract_number_and_incr (&mcnt, &p);
+ fprintf (stderr, "/lookahead/%d", mcnt);
+ break;
+ case lookahead_not:
+ extract_number_and_incr (&mcnt, &p);
+ fprintf (stderr, "/lookahead_not/%d", mcnt);
+ break;
+ case lookbehind:
+ extract_number_and_incr (&mcnt, &p);
+ extract_number_and_incr (&mcnt2, &p);
+ fprintf (stderr, "/lookbehind/%d/%d", mcnt, mcnt2);
+ break;
+ case lookbehind_not:
+ extract_number_and_incr (&mcnt, &p);
+ extract_number_and_incr (&mcnt2, &p);
+ fprintf (stderr, "/lookbehind_not/%d/%d", mcnt, mcnt2);
+ break;
+ case lookaround_succeed:
+ fprintf (stderr, "/lookaround_succeed");
+ break;
+ case lookaround_fail:
+ fprintf (stderr, "/lookaround_fail");
+ break;
+
+
case duplicate:
fprintf (stderr, "/duplicate/%d", *p++);
break;
@@ -706,10 +743,18 @@ print_partial_compiled_pattern (re_char *start, re_char *end)
fprintf (stderr, "/%d", mcnt);
break;
+ case before_dot:
+ fprintf (stderr, "/before_dot");
+ break;
+
case at_dot:
fputs ("/at_dot", stderr);
break;
+ case after_dot:
+ fprintf (stderr, "/after_dot");
+ break;
+
case categoryspec:
fputs ("/categoryspec", stderr);
mcnt = *p++;
@@ -1146,6 +1191,30 @@ #define POP_FAILURE_POINT(str, pat) \
DEBUG_STATEMENT (nfailure_points_popped++); \
} while (false) /* POP_FAILURE_POINT */
+#define FINISH_LOOKAROUND() \
+ do { \
+ re_char *str, *pat; \
+ re_opcode_t op; \
+ discard_saved_regs = 1; \
+ while (!FAIL_STACK_EMPTY ()) \
+ { \
+ POP_FAILURE_POINT (str, pat); \
+ op = (re_opcode_t) *pat; \
+ if (op == lookahead \
+ || op == lookahead_not \
+ || op == lookbehind \
+ || op == lookbehind_not) \
+ { \
+ d = str; \
+ dend = ((d >= string1 && d <= end1) \
+ ? end_match_1 : end_match_2); \
+ break; \
+ } \
+ } \
+ discard_saved_regs = 0; \
+ } while (0);
+
+
/* Registers are set to a sentinel when they haven't yet matched. */
@@ -1285,6 +1354,7 @@ verify (LONG_MIN <= -(MAX_BUF_SIZE - 1) && MAX_BUF_SIZE - 1 <= LONG_MAX);
pattern_offset_t fixup_alt_jump;
pattern_offset_t laststart_offset;
regnum_t regnum;
+ int lookaround;
} compile_stack_elt_t;
@@ -1656,6 +1726,7 @@ extend_range_table_work_area (struct range_table_work_area *work_area)
/* regex_compile and helpers. */
static bool group_in_compile_stack (compile_stack_type, regnum_t);
+static int exact_chars_in_pattern_buffer (struct re_pattern_buffer *bufp, re_char *p, re_char *pend);
/* Insert the 'jump' from the end of last alternative to "here".
The space for the jump has already been allocated. */
@@ -2188,6 +2259,7 @@ regex_compile (re_char *pattern, ptrdiff_t size,
case '(':
{
bool shy = false;
+ int lookaround = 0;
regnum_t regnum = 0;
if (p+1 < pend)
{
@@ -2214,6 +2286,28 @@ regex_compile (re_char *pattern, ptrdiff_t size,
&regnum))
FREE_STACK_RETURN (REG_ESIZE);
break;
+ case '=':
+ /* Positive lookahead assertion. */
+ shy = lookaround = 1;
+ break;
+ case '!':
+ /* Negative lookahead assertion. */
+ shy = lookaround = 2;
+ break;
+ case '<':
+ {
+ PATFETCH (c);
+ if (c == '=')
+ /* Positive lookbehind assertion. */
+ shy = lookaround = -1;
+ else if (c == '!')
+ /* Negative lookbehind assertion. */
+ shy = lookaround = -2;
+ else
+ FREE_STACK_RETURN (REG_BADPAT);
+ }
+ break;
+
default:
/* Only (?:...) is supported right now. */
FREE_STACK_RETURN (REG_BADPAT);
@@ -2256,6 +2350,7 @@ regex_compile (re_char *pattern, ptrdiff_t size,
= fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
COMPILE_STACK_TOP.regnum = regnum;
+ COMPILE_STACK_TOP.lookaround = lookaround;
/* Do not push a start_memory for groups beyond the last one
we can represent in the compiled pattern. */
@@ -2292,6 +2387,7 @@ regex_compile (re_char *pattern, ptrdiff_t size,
later groups should continue to be numbered higher,
as in '(ab)c(de)' -- the second group is #2. */
regnum_t regnum;
+ int lookaround;
compile_stack.avail--;
begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
@@ -2304,13 +2400,40 @@ regex_compile (re_char *pattern, ptrdiff_t size,
/* If we've reached MAX_REGNUM groups, then this open
won't actually generate any code, so we'll have to
clear pending_exact explicitly. */
+ lookaround = COMPILE_STACK_TOP.lookaround;
pending_exact = 0;
/* We're at the end of the group, so now we know how many
groups were inside this one. */
if (regnum <= MAX_REGNUM && regnum > 0)
BUF_PUSH_2 (stop_memory, regnum);
- }
+ else if (lookaround)
+ {
+ if (lookaround > 0)
+ {
+ /* Positive/negative lookahead assertion. */
+ GET_BUFFER_SPACE (3);
+ INSERT_JUMP (lookaround == 1 ? lookahead : lookahead_not, laststart, b + 4);
+ b += 3;
+ }
+ else
+ {
+ /* Positive/negative lookbehind assertion. */
+ int count = exact_chars_in_pattern_buffer (bufp, laststart, b);
+ if (count == -1) /* variable length */
+ FREE_STACK_RETURN (REG_BADPAT);
+
+ GET_BUFFER_SPACE (5);
+ INSERT_JUMP2 (lookaround == -1 ? lookbehind : lookbehind_not, laststart, b + 6, count);
+ b += 5;
+ }
+
+ /* Negative form. */
+ if (lookaround > 1 || lookaround < -1)
+ BUF_PUSH (lookaround_fail);
+ BUF_PUSH (lookaround_succeed);
+ }
+ }
break;
@@ -2995,11 +3118,19 @@ analyze_first (re_char *p, re_char *pend, char *fastmap, bool multibyte)
match_any_multibyte_characters = true;
}
break;
+ case lookahead:
+ case lookahead_not:
+ case lookbehind:
+ case lookbehind_not:
+ if (!fastmap) break;
+ return -1;
/* All cases after this match the empty string. These end with
'continue'. */
+ case before_dot:
case at_dot:
+ case after_dot:
case no_op:
case begline:
case endline:
@@ -3573,6 +3704,96 @@ skip_noops (re_char *p, re_char *pend)
return p;
}
+
+static int
+exact_chars_in_pattern_buffer (bufp, p, pend)
+ struct re_pattern_buffer *bufp;
+ re_char *p, *pend;
+{
+ int count = 0;
+ while (p < pend)
+ {
+ switch (*p++)
+ {
+ case exactn:
+ {
+ int mcnt = *p++;
+ int buf_charlen;
+ while (mcnt > 0) {
+ STRING_CHAR_AND_LENGTH (p, buf_charlen);
+ p += buf_charlen;
+ mcnt -= buf_charlen;
+ count++;
+ }
+ }
+ break;
+ case start_memory:
+ case stop_memory:
+ p++;
+ break;
+#ifdef emacs
+ case categoryspec:
+ case notcategoryspec:
+#endif /* emacs */
+ case syntaxspec:
+ case notsyntaxspec:
+ p++;
+ case anychar:
+ count++;
+ break;
+
+ case charset:
+ case charset_not:
+ if (CHARSET_RANGE_TABLE_EXISTS_P (p - 1))
+ {
+ int mcnt;
+ p = CHARSET_RANGE_TABLE (p - 1);
+ EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ p = CHARSET_RANGE_TABLE_END (p, mcnt);
+ }
+ else
+ p += 1 + CHARSET_BITMAP_SIZE (p - 1);
+ count++;
+ break;
+
+#ifdef emacs
+ case before_dot:
+ case at_dot:
+ case after_dot:
+#endif /* emacs */
+ case no_op:
+ case begline:
+ case endline:
+ case begbuf:
+ case endbuf:
+ case wordbound:
+ case notwordbound:
+ case wordbeg:
+ case wordend:
+ case symbeg:
+ case symend:
+ /* Zero width. */
+ continue;
+ case lookahead:
+ case lookahead_not:
+ case lookbehind:
+ case lookbehind_not:
+ /* Skip to lookaround_success. */
+ while (p < pend)
+ {
+ if ((re_opcode_t) *p++ == lookaround_succeed)
+ break;
+ }
+ break;
+ default:
+ return -1;
+ }
+ }
+ return count;
+}
+
+
+
/* Test if C matches charset op. *PP points to the charset or charset_not
opcode. When the function finishes, *PP will be advanced past that opcode.
C is character to test (possibly after translations) and CORIG is original
@@ -3944,6 +4165,9 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
bool best_regs_set = false;
re_char **best_regstart UNINIT, **best_regend UNINIT;
+ /* Discard a saved register from the stack. */
+ bool discard_saved_regs = 0;
+
/* Logically, this is 'best_regend[0]'. But we don't want to have to
allocate space for that if we're not allocating space for anything
else (see below). Also, we never need info about register 0 for
@@ -4438,6 +4662,77 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
p += 1;
break;
+ case lookahead:
+ case lookahead_not:
+ DEBUG_PRINT ((re_opcode_t) *(p - 1) == lookahead ? "EXECUTING lookahead.\n" : "EXECUTING lookahead_not.\n");
+
+ p += 2;
+ PUSH_FAILURE_POINT (p - 3, d);
+ break;
+
+ case lookbehind:
+ case lookbehind_not:
+ {
+ int mcnt, count_local;
+ bool not = (re_opcode_t) *(p - 1) != lookbehind;
+
+ EXTRACT_NUMBER_AND_INCR (mcnt, p);
+ EXTRACT_NUMBER_AND_INCR (count_local, p);
+
+ DEBUG_PRINT (not
+ ? "EXECUTING lookbehind_not %d.\n"
+ : "EXECUTING lookbehind %d.\n", count);
+
+ dfail = d;
+ while (d != string1 && count_local > 0)
+ {
+ if (d == string2)
+ {
+ if (!string1)
+ break;
+ d = end1;
+ dend = end_match_1;
+ }
+
+ if (target_multibyte)
+ {
+ re_char *dhead = (d >= string1 && d <= end1) ? string1 : string2;
+ PREV_CHAR_BOUNDARY (d, dhead);
+ }
+ else
+ d--;
+ count_local--;
+ }
+
+ if (count_local > 0)
+ {
+ if (not)
+ {
+ /* There is no enough string to match.
+ So just make it succeeded here. */
+ d = dfail;
+ p = p - 2 + mcnt;
+ break;
+ }
+ else
+ goto fail;
+ }
+
+ PUSH_FAILURE_POINT (p - 5, dfail);
+ }
+ break;
+
+ case lookaround_succeed:
+ DEBUG_PRINT ("EXECUTING lookaround_succeed.\n");
+
+ FINISH_LOOKAROUND();
+ break;
+
+ case lookaround_fail:
+ DEBUG_PRINT ("EXECUTING lookaround_fail.\n");
+
+ FINISH_LOOKAROUND();
+ goto fail;
/* \<digit> has been turned into a 'duplicate' command which is
followed by the numeric value of <digit> as the register number. */
@@ -5003,12 +5298,24 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
}
break;
+ case before_dot:
+ DEBUG_PRINT ("EXECUTING before_dot.\n");
+ if (PTR_BYTE_POS (d) >= PT_BYTE)
+ goto fail;
+ break;
+
case at_dot:
DEBUG_PRINT ("EXECUTING at_dot.\n");
if (PTR_BYTE_POS (d) != PT_BYTE)
goto fail;
break;
+ case after_dot:
+ DEBUG_PRINT ("EXECUTING after_dot.\n");
+ if (PTR_BYTE_POS (d) <= PT_BYTE)
+ goto fail;
+ break;
+
case categoryspec:
case notcategoryspec:
{
@@ -5058,12 +5365,16 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
case on_failure_jump_loop:
case on_failure_jump:
case succeed_n:
+ case lookahead_not:
+ case lookbehind_not:
d = str;
continue_failure_jump:
EXTRACT_NUMBER_AND_INCR (mcnt, pat);
p = pat + mcnt;
break;
+ case lookahead:
+ case lookbehind:
case no_op:
/* A special frame used for nastyloops. */
goto fail;
@snippins
Copy link
Author

snippins commented Aug 8, 2021

Adapted for Emacs 29 from this:

https://lists.gnu.org/archive/html/emacs-devel/2009-06/msg00094.html

Spec:

  • Any pattern is allowed in lookahead assertion.
  • Nested lookaround assertion is allowed.
  • Capturing is allowed only in positive lookahead/lookbehind assertion.
  • Duplication is allowed after such assertion.
  • Variable length pattern is NOT yet allowed in lookbehind assertion.
    [x] (?<=[0-9]+)MB
    [o] (?<=[0-9][0-9][0-9][0-9])MB
  • Lookahead assertion over start bound is not allowed in re-search-backward.
    (re-search-backward "(?<=a)b") for buffer "abca_|_b"
    will seek to first "ab".

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment