Created
July 29, 2015 09:50
-
-
Save wasamasa/7d7983f66e43c8907e03 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; xr - convert string regexp to rx notation | |
(require 'rx) | |
(defun xr-parse-char-alt () | |
(let ((set nil)) | |
(when (looking-at "]") | |
(forward-char 1) | |
(setq set (list "]"))) | |
(while (not (looking-at "]")) | |
(cond | |
;; character class | |
((looking-at (rx "[:" (group (one-or-more letter)) ":]")) | |
(let* ((sym (intern (match-string 1))) | |
(rx-sym (cond ((eq sym 'unibyte) 'ascii) | |
((eq sym 'multibyte) 'nonascii) | |
(t sym)))) | |
(setq set (cons sym set)) | |
(goto-char (match-end 0)))) | |
;; character range | |
((looking-at (rx (not (any "]")) "-" (not (any "]")))) | |
(let ((range (match-string 0))) | |
;; We render [0-9] as (any "0-9") instead of (any (?0 . ?9)) | |
;; for readability and brevity, and because the latter would | |
;; become (48 . 57) when printed. | |
(setq set (cons range set)) | |
(goto-char (match-end 0)))) | |
((looking-at (rx eos)) | |
(error "unterminated character alternative")) | |
;; plain character (including ^ or -) | |
(t | |
(setq set (cons (char-to-string (following-char)) set)) | |
(forward-char 1)))) | |
;; FIXME: combine several characters into one string (if there is no "-"), | |
;; like (any "a" "b") -> (any "ab") | |
set)) | |
;; Reverse a sequence and concatenate adjacent strings. | |
(defun xr-rev-join-seq (rev-seq) | |
(let ((seq nil)) | |
(while rev-seq | |
(if (and (stringp (car rev-seq)) (stringp (car seq))) | |
(setq seq (cons (concat (car rev-seq) (car seq)) (cdr seq))) | |
(setq seq (cons (car rev-seq) seq))) | |
(setq rev-seq (cdr rev-seq))) | |
seq)) | |
(defun xr-parse-seq () | |
(let ((sequence nil)) ; reversed | |
(while (not (looking-at (rx (or "\\|" "\\)" eos)))) | |
(cond | |
;; nonspecial character | |
((looking-at (rx (not (any "\\*+?.^$[")))) | |
(forward-char 1) | |
(setq sequence (cons (match-string 0) sequence))) | |
;; escaped special | |
((looking-at (rx "\\" (group (any "\\*+?.^$[")))) | |
(forward-char 2) | |
(setq sequence (cons (match-string 1) sequence))) | |
;; group | |
((looking-at (rx "\\(" | |
(opt (group "?" (group (zero-or-more digit)) ":")))) | |
(let ((question (match-string 1)) | |
(number (match-string 2)) | |
(end (match-end 0))) | |
(goto-char end) | |
(let* ((group (xr-parse-alt)) | |
;; optimise - group has an implicit seq | |
(operand (if (and (listp group) (eq (car group) 'seq)) | |
(cdr group) | |
(list group)))) | |
(when (not (looking-at (rx "\\)"))) | |
(error "missing \\)")) | |
(forward-char 2) | |
(let ((item (cond ((not question) ; plain subgroup | |
(cons 'group operand)) | |
((zerop (length number)) ; shy group | |
group) | |
(t | |
(append (list 'group-n (string-to-number number)) | |
operand))))) | |
(setq sequence (cons item sequence)))))) | |
;; * ? + (and non-greedy variants) | |
((looking-at (rx (group (any "*?+")) (opt (group "?")))) | |
(let ((op (match-string 1)) | |
(non-greedy (match-string 2))) | |
(goto-char (match-end 0)) | |
(when (null sequence) | |
(error "postfix operator without operand")) | |
;; While we could use the same symbols as the operator in the regexp, | |
;; ? needs to be escaped in symbols and isn't very neat, so we | |
;; assume that rx-greedy-flag is set. | |
(let* ((sym (cdr (assoc op '(("*" . zero-or-more) | |
("+" . one-or-more) | |
("?" . opt))))) | |
(operand (car sequence)) | |
;; Optimise when the operand is (seq ...) | |
(item | |
(if (and (listp operand) (eq (car operand) 'seq)) | |
(cons sym (cdr operand)) | |
(list sym operand)))) | |
;; BUG: minimal-match affects everything inside, which is not | |
;; what we want. Either keep track of the stuff inside and insert | |
;; maximal-match as appropriate (messy!) or just use the | |
;; *?, ?? and +? symbols. | |
(setq sequence (cons (if non-greedy (list 'minimal-match item) item) | |
(cdr sequence)))))) | |
;; \{..\} | |
((looking-at (rx "\\{" (or (group (one-or-more digit)) | |
(seq | |
(opt (group (one-or-more digit))) | |
"," | |
(opt (group (one-or-more digit))))) | |
"\\}")) | |
(when (null sequence) | |
(error "repetition without operand")) | |
(let ((exactly (match-string 1)) | |
(lower (match-string 2)) | |
(upper (match-string 3))) | |
(goto-char (match-end 0)) | |
(let ((op (cond (exactly | |
(list '= (string-to-number exactly))) | |
((and lower upper) | |
(list 'repeat | |
(string-to-number lower) | |
(string-to-number upper))) | |
(lower | |
(list '>= (string-to-number lower))) | |
(upper | |
(list 'repeat 0 (string-to-number upper))) | |
(t | |
(list 'zero-or-more))))) | |
(setq sequence (cons (append op (list (car sequence))) | |
(cdr sequence)))))) | |
;; character alternative | |
((looking-at (rx "[" (opt (group "^")))) | |
(goto-char (match-end 0)) | |
;; FIXME: optimise (any digit) -> digit etc | |
(let* ((negated (match-string 1)) | |
(set (cons 'any (xr-parse-char-alt)))) | |
(forward-char 1) | |
(setq sequence (cons (if negated (list 'not set) set) sequence)))) | |
;; backref | |
((looking-at (rx "\\" (group digit))) | |
(forward-char 2) | |
(setq sequence | |
(cons (list 'backref (string-to-number (match-string 1))) | |
sequence))) | |
;; various simple substitutions | |
((looking-at (rx (or "." "$" "^" "\\w" "\\W" "\\`" "\\'" "\\=" | |
"\\b" "\\B" "\\<" "\\>" "\\_<" "\\_>"))) | |
(goto-char (match-end 0)) | |
(let ((sym (cdr (assoc | |
(match-string 0) | |
'(("." . nonl) | |
("^" . bol) ("$" . eol) | |
("\\w" . wordchar) ("\\W" . not-wordchar) | |
("\\`" . bos) ("\\'" . eos) | |
("\\=" . point) | |
("\\b" . word-boundary) ("\\B" . not-word-boundary) | |
("\\<" . bow) ("\\>" . eow) | |
("\\_<" . symbol-start) ("\\_>" . symbol-end)))))) | |
(setq sequence (cons sym sequence)))) | |
;; character syntax | |
((looking-at (rx "\\" (group (any "sS")) (group anything))) | |
(let ((negated (string-equal (match-string 1) "S")) | |
(syntax-code (match-string 2))) | |
(goto-char (match-end 0)) | |
(let ((sym (assoc syntax-code | |
'(("-" . whitespace) | |
("." . punctuation) | |
("w" . word) | |
("_" . symbol) | |
("(" . open-parenthesis) | |
(")" . close-parenthesis) | |
("'" . expression-prefix) | |
("\"" . string-quote) | |
("$" . paired-delimiter) | |
("\\" . escape) | |
("/" . character-quote) | |
("<" . comment-start) | |
(">" . comment-end) | |
("|" . string-delimiter) | |
("!" . comment-delimiter))))) | |
(when (not sym) | |
(error "unknown syntax code: %s" syntax-code)) | |
(let ((item (list 'syntax (cdr sym)))) | |
(setq sequence (cons (if negated (list 'not item) item) | |
sequence)))))) | |
;; character categories | |
((looking-at (rx "\\" (group (any "cC")) (group anything))) | |
(let ((negated (string-equal (match-string 1) "C")) | |
(category-code (match-string 2))) | |
(goto-char (match-end 0)) | |
(let ((sym (assoc category-code | |
'(("0" . consonant) | |
("1" . base-vowel) | |
("2" . upper-diacritical-mark) | |
("3" . lower-diacritical-mark) | |
("4" . tone-mark) | |
("5" . symbol) | |
("6" . digit) | |
("7" . vowel-modifying-diacritical-mark) | |
("8" . vowel-sign) | |
("9" . semivowel-lower) | |
("<" . not-at-end-of-line) | |
(">" . not-at-beginning-of-line) | |
("A" . alpha-numeric-two-byte) | |
("C" . chinse-two-byte) | |
("G" . greek-two-byte) | |
("H" . japanese-hiragana-two-byte) | |
("I" . indian-tow-byte) | |
("K" . japanese-katakana-two-byte) | |
("N" . korean-hangul-two-byte) | |
("Y" . cyrillic-two-byte) | |
("^" . combining-diacritic) | |
("a" . ascii) | |
("b" . arabic) | |
("c" . chinese) | |
("e" . ethiopic) | |
("g" . greek) | |
("h" . korean) | |
("i" . indian) | |
("j" . japanese) | |
("k" . japanese-katakana) | |
("l" . latin) | |
("o" . lao) | |
("q" . tibetan) | |
("r" . japanese-roman) | |
("t" . thai) | |
("v" . vietnamese) | |
("w" . hebrew) | |
("y" . cyrillic) | |
("|" . can-break))))) | |
(when (not sym) | |
(error "unknown category code: %s" category-code)) | |
(let ((item (list 'category (cdr sym)))) | |
(setq sequence (cons (if negated (list 'not item) item) | |
sequence)))))) | |
;; error | |
(t | |
(let* ((start (point)) | |
(end (min (+ start 3) (point-max)))) | |
(error "syntax error: %s" (buffer-substring start end)))))) | |
(let ((item-seq (xr-rev-join-seq sequence))) | |
(if (> (length item-seq) 1) | |
(cons 'seq item-seq) | |
(car item-seq))))) | |
(defun xr-parse-alt () | |
(let ((alternatives nil)) ; reversed | |
(while (not (looking-at (rx (or "\\)" eos)))) | |
(setq alternatives (cons (xr-parse-seq) alternatives)) | |
(when (looking-at (rx "\\|")) | |
(forward-char 2))) | |
(if (> (length alternatives) 1) | |
(cons 'or (reverse alternatives)) | |
(car alternatives)))) | |
(defun xr (re-string) | |
"Convert a regexp string to rx notation." | |
(with-temp-buffer | |
(insert re-string) | |
(goto-char (point-min)) | |
(let ((rx (xr-parse-alt))) | |
(when (looking-at (rx "\\)")) | |
(error "unbalanced \\)")) | |
rx))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment