Skip to content

Instantly share code, notes, and snippets.

@wasamasa
Created July 29, 2015 09:50
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save wasamasa/7d7983f66e43c8907e03 to your computer and use it in GitHub Desktop.
Save wasamasa/7d7983f66e43c8907e03 to your computer and use it in GitHub Desktop.
;; xr - convert string regexp to rx notation
(require 'rx)
(defun xr-parse-char-alt ()
(let ((set nil))
(when (looking-at "]")
(forward-char 1)
(setq set (list "]")))
(while (not (looking-at "]"))
(cond
;; character class
((looking-at (rx "[:" (group (one-or-more letter)) ":]"))
(let* ((sym (intern (match-string 1)))
(rx-sym (cond ((eq sym 'unibyte) 'ascii)
((eq sym 'multibyte) 'nonascii)
(t sym))))
(setq set (cons sym set))
(goto-char (match-end 0))))
;; character range
((looking-at (rx (not (any "]")) "-" (not (any "]"))))
(let ((range (match-string 0)))
;; We render [0-9] as (any "0-9") instead of (any (?0 . ?9))
;; for readability and brevity, and because the latter would
;; become (48 . 57) when printed.
(setq set (cons range set))
(goto-char (match-end 0))))
((looking-at (rx eos))
(error "unterminated character alternative"))
;; plain character (including ^ or -)
(t
(setq set (cons (char-to-string (following-char)) set))
(forward-char 1))))
;; FIXME: combine several characters into one string (if there is no "-"),
;; like (any "a" "b") -> (any "ab")
set))
;; Reverse a sequence and concatenate adjacent strings.
(defun xr-rev-join-seq (rev-seq)
(let ((seq nil))
(while rev-seq
(if (and (stringp (car rev-seq)) (stringp (car seq)))
(setq seq (cons (concat (car rev-seq) (car seq)) (cdr seq)))
(setq seq (cons (car rev-seq) seq)))
(setq rev-seq (cdr rev-seq)))
seq))
(defun xr-parse-seq ()
(let ((sequence nil)) ; reversed
(while (not (looking-at (rx (or "\\|" "\\)" eos))))
(cond
;; nonspecial character
((looking-at (rx (not (any "\\*+?.^$["))))
(forward-char 1)
(setq sequence (cons (match-string 0) sequence)))
;; escaped special
((looking-at (rx "\\" (group (any "\\*+?.^$["))))
(forward-char 2)
(setq sequence (cons (match-string 1) sequence)))
;; group
((looking-at (rx "\\("
(opt (group "?" (group (zero-or-more digit)) ":"))))
(let ((question (match-string 1))
(number (match-string 2))
(end (match-end 0)))
(goto-char end)
(let* ((group (xr-parse-alt))
;; optimise - group has an implicit seq
(operand (if (and (listp group) (eq (car group) 'seq))
(cdr group)
(list group))))
(when (not (looking-at (rx "\\)")))
(error "missing \\)"))
(forward-char 2)
(let ((item (cond ((not question) ; plain subgroup
(cons 'group operand))
((zerop (length number)) ; shy group
group)
(t
(append (list 'group-n (string-to-number number))
operand)))))
(setq sequence (cons item sequence))))))
;; * ? + (and non-greedy variants)
((looking-at (rx (group (any "*?+")) (opt (group "?"))))
(let ((op (match-string 1))
(non-greedy (match-string 2)))
(goto-char (match-end 0))
(when (null sequence)
(error "postfix operator without operand"))
;; While we could use the same symbols as the operator in the regexp,
;; ? needs to be escaped in symbols and isn't very neat, so we
;; assume that rx-greedy-flag is set.
(let* ((sym (cdr (assoc op '(("*" . zero-or-more)
("+" . one-or-more)
("?" . opt)))))
(operand (car sequence))
;; Optimise when the operand is (seq ...)
(item
(if (and (listp operand) (eq (car operand) 'seq))
(cons sym (cdr operand))
(list sym operand))))
;; BUG: minimal-match affects everything inside, which is not
;; what we want. Either keep track of the stuff inside and insert
;; maximal-match as appropriate (messy!) or just use the
;; *?, ?? and +? symbols.
(setq sequence (cons (if non-greedy (list 'minimal-match item) item)
(cdr sequence))))))
;; \{..\}
((looking-at (rx "\\{" (or (group (one-or-more digit))
(seq
(opt (group (one-or-more digit)))
","
(opt (group (one-or-more digit)))))
"\\}"))
(when (null sequence)
(error "repetition without operand"))
(let ((exactly (match-string 1))
(lower (match-string 2))
(upper (match-string 3)))
(goto-char (match-end 0))
(let ((op (cond (exactly
(list '= (string-to-number exactly)))
((and lower upper)
(list 'repeat
(string-to-number lower)
(string-to-number upper)))
(lower
(list '>= (string-to-number lower)))
(upper
(list 'repeat 0 (string-to-number upper)))
(t
(list 'zero-or-more)))))
(setq sequence (cons (append op (list (car sequence)))
(cdr sequence))))))
;; character alternative
((looking-at (rx "[" (opt (group "^"))))
(goto-char (match-end 0))
;; FIXME: optimise (any digit) -> digit etc
(let* ((negated (match-string 1))
(set (cons 'any (xr-parse-char-alt))))
(forward-char 1)
(setq sequence (cons (if negated (list 'not set) set) sequence))))
;; backref
((looking-at (rx "\\" (group digit)))
(forward-char 2)
(setq sequence
(cons (list 'backref (string-to-number (match-string 1)))
sequence)))
;; various simple substitutions
((looking-at (rx (or "." "$" "^" "\\w" "\\W" "\\`" "\\'" "\\="
"\\b" "\\B" "\\<" "\\>" "\\_<" "\\_>")))
(goto-char (match-end 0))
(let ((sym (cdr (assoc
(match-string 0)
'(("." . nonl)
("^" . bol) ("$" . eol)
("\\w" . wordchar) ("\\W" . not-wordchar)
("\\`" . bos) ("\\'" . eos)
("\\=" . point)
("\\b" . word-boundary) ("\\B" . not-word-boundary)
("\\<" . bow) ("\\>" . eow)
("\\_<" . symbol-start) ("\\_>" . symbol-end))))))
(setq sequence (cons sym sequence))))
;; character syntax
((looking-at (rx "\\" (group (any "sS")) (group anything)))
(let ((negated (string-equal (match-string 1) "S"))
(syntax-code (match-string 2)))
(goto-char (match-end 0))
(let ((sym (assoc syntax-code
'(("-" . whitespace)
("." . punctuation)
("w" . word)
("_" . symbol)
("(" . open-parenthesis)
(")" . close-parenthesis)
("'" . expression-prefix)
("\"" . string-quote)
("$" . paired-delimiter)
("\\" . escape)
("/" . character-quote)
("<" . comment-start)
(">" . comment-end)
("|" . string-delimiter)
("!" . comment-delimiter)))))
(when (not sym)
(error "unknown syntax code: %s" syntax-code))
(let ((item (list 'syntax (cdr sym))))
(setq sequence (cons (if negated (list 'not item) item)
sequence))))))
;; character categories
((looking-at (rx "\\" (group (any "cC")) (group anything)))
(let ((negated (string-equal (match-string 1) "C"))
(category-code (match-string 2)))
(goto-char (match-end 0))
(let ((sym (assoc category-code
'(("0" . consonant)
("1" . base-vowel)
("2" . upper-diacritical-mark)
("3" . lower-diacritical-mark)
("4" . tone-mark)
("5" . symbol)
("6" . digit)
("7" . vowel-modifying-diacritical-mark)
("8" . vowel-sign)
("9" . semivowel-lower)
("<" . not-at-end-of-line)
(">" . not-at-beginning-of-line)
("A" . alpha-numeric-two-byte)
("C" . chinse-two-byte)
("G" . greek-two-byte)
("H" . japanese-hiragana-two-byte)
("I" . indian-tow-byte)
("K" . japanese-katakana-two-byte)
("N" . korean-hangul-two-byte)
("Y" . cyrillic-two-byte)
("^" . combining-diacritic)
("a" . ascii)
("b" . arabic)
("c" . chinese)
("e" . ethiopic)
("g" . greek)
("h" . korean)
("i" . indian)
("j" . japanese)
("k" . japanese-katakana)
("l" . latin)
("o" . lao)
("q" . tibetan)
("r" . japanese-roman)
("t" . thai)
("v" . vietnamese)
("w" . hebrew)
("y" . cyrillic)
("|" . can-break)))))
(when (not sym)
(error "unknown category code: %s" category-code))
(let ((item (list 'category (cdr sym))))
(setq sequence (cons (if negated (list 'not item) item)
sequence))))))
;; error
(t
(let* ((start (point))
(end (min (+ start 3) (point-max))))
(error "syntax error: %s" (buffer-substring start end))))))
(let ((item-seq (xr-rev-join-seq sequence)))
(if (> (length item-seq) 1)
(cons 'seq item-seq)
(car item-seq)))))
(defun xr-parse-alt ()
(let ((alternatives nil)) ; reversed
(while (not (looking-at (rx (or "\\)" eos))))
(setq alternatives (cons (xr-parse-seq) alternatives))
(when (looking-at (rx "\\|"))
(forward-char 2)))
(if (> (length alternatives) 1)
(cons 'or (reverse alternatives))
(car alternatives))))
(defun xr (re-string)
"Convert a regexp string to rx notation."
(with-temp-buffer
(insert re-string)
(goto-char (point-min))
(let ((rx (xr-parse-alt)))
(when (looking-at (rx "\\)"))
(error "unbalanced \\)"))
rx)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment