Skip to content

Instantly share code, notes, and snippets.

@soegaard
Created May 5, 2022 10:40
Show Gist options
  • Save soegaard/b1b680702a0c05e2f2fb961565d50dfb to your computer and use it in GitHub Desktop.
Save soegaard/b1b680702a0c05e2f2fb961565d50dfb to your computer and use it in GitHub Desktop.
Regular Expression Matching for S-expressions
#lang racket
;;;
;;; Regular Expression Matching for S-expressions
;;;
;; Exports
(provide regular-match ; the main form
named-groups) ; holds immutable hash table of named groups
;; Imports
(require (for-syntax racket/base syntax/parse racket/syntax racket/format))
;; Literals used in the syntax of regular expressions
(define-syntax ? (λ (stx) (raise-syntax-error '? "used out of context" stx)))
(define-syntax ^ (λ (stx) (raise-syntax-error '^ "used out of context" stx))) ; begin
(define-syntax $ (λ (stx) (raise-syntax-error '$ "used out of context" stx))) ; end
(define-syntax *? (λ (stx) (raise-syntax-error '*? "used out of context" stx))) ; non-greedy star
(define-syntax +? (λ (stx) (raise-syntax-error '+? "used out of context" stx))) ; non-greedy plus
(define-syntax group (λ (stx) (raise-syntax-error 'group "used out of context" stx)))
(define-syntax back (λ (stx) (raise-syntax-error 'back "used out of context" stx)))
;; The grammar for regular expressions
(begin-for-syntax
(define-syntax-class Sequence ; aka concatenation
#:description "regular expression matching a sequence of regular expressions"
(pattern (r1:Regexp ...)))
(define-syntax-class Or ; aka alternation
#:description "regular expression matching a at least one of the regular expressions"
#:literals (/)
(pattern (/ r1:Regexp ...)))
(define-syntax-class Star ; aka Kleene star
#:description "regular expression matching zero or more occurrences: (* regexp)"
#:literals (*)
(pattern (* r:Regexp)))
(define-syntax-class StarNonGreedy
#:description "regular expression matching zero or more occurrences: (*? regexp)"
#:literals (*?)
(pattern (*? r:Regexp)))
(define-syntax-class Plus
#:description "regular expression matching one or more occurrences: (+ regexp]"
#:literals (+)
(pattern (+ r:Regexp)))
(define-syntax-class PlusNonGreedy
#:description "regular expression matching one or more occurrences: (+? regexp]"
#:literals (+?)
(pattern (+? r:Regexp)))
(define-syntax-class Question
#:description "regular expression matching zero or one occurrences: (? regexp)"
#:literals (?)
(pattern (? r:Regexp)))
(define-syntax-class Beginning
#:description "regular expression matching the beginning of the input"
#:literals (^)
(pattern ^))
(define-syntax-class End
#:description "regular expression matching the end of the input"
#:literals ($)
(pattern $))
(define-syntax-class Something
#:description "regular expression matching anything"
#:literals (_)
(pattern _))
(define-syntax-class Group
#:description "named group"
#:literals (group)
(pattern (group name:id r1:Regexp ...)))
(define-syntax-class Back
#:description "back reference"
#:literals (back)
(pattern (back name:id)))
(define-syntax-class Literal
#:description "regular expression match a specific literal"
(pattern (~or r:number
r:char
r:string
r:boolean
r:id)))
(define-syntax-class Regexp
#:description "Regular Expression"
(pattern (~or r:Sequence
r:Or
r:Star
r:StarNonGreedy
r:Plus
r:PlusNonGreedy
r:Question
r:Beginning
r:End
r:Something
r:Literal
r:Group
r:Back
)))
(define-syntax-class MatchClause
#:description "Clause with expression to be matched and corresponding result"
(pattern [e:expr result:expr]))
(define-syntax-class ElseClause
#:description "Clause with expression to be use when no values matched"
#:literals (else)
(pattern [else result:expr])))
;; General Utilities
; Example:
; > (for/list ([xs (in-tails '(a b c))]) xs)
; '((a b c) (b c) (c) ())
(define (in-tails xs)
(make-do-sequence
(λ ()
(define (pos->element p) p)
(define (next-position p) (if (null? p) #f (cdr p)))
(define initial-position xs)
(define (continue-with-pos? p) p)
(values pos->element
next-position
initial-position
continue-with-pos?
#f #f))))
;; Predicates
(define (beginning? re) (eq? re '^))
(define (end? re) (eq? re '$))
(define (something? re) (eq? re '_))
(define (literal? re) (or (number? re) (char? re) (string? re) (boolean? re) (symbol? re)))
(define (star? re) (and (pair? re) (eq? (car re) '*)))
(define (plus? re) (and (pair? re) (eq? (car re) '+)))
(define (question? re) (and (pair? re) (eq? (car re) '?)))
(define (or? re) (and (pair? re) (eq? (car re) '/)))
(define (star/ng? re) (and (pair? re) (eq? (car re) '*?)))
(define (plus/ng? re) (and (pair? re) (eq? (car re) '+?)))
(define (group? re) (and (pair? re) (eq? (car re) 'group)))
(define (back? re) (and (pair? re) (eq? (car re) 'back)))
(define (sequence/beginning? re) (and (pair? re) (beginning? (car re))))
;; Accessors
(define (skip re) (cdr re)) ; skip first regexp in a sequence
(define (sub re) (cadr re)) ; get sub expression in (* re), (+ re) etc.
(define (alts re) (cdr re)) ; get alternatives in (or re1 re2 ...)
(define (group-name re) (cadr re))
(define (group-sequence re) (cddr re))
(define (back-name re) (cadr re))
;; Parameters
; Uses as a way to output information on named groups
(define named-groups (make-parameter (make-immutable-hash)))
;; The Matcher
(define (match-anywhere res input)
; (displayln (list 'match-anywhere res input))
; Note: We can assume that res is a legal sequence regular expression,
; since its syntax has been checked by syntax-parse.
(named-groups (make-immutable-hash))
(if (sequence/beginning? res)
(match-here (skip res) input)
(for/or ([in (in-tails input)])
; (display "> ")
(begin0
(match-here res in)
#;(newline)))))
(define (match-here res input)
; (displayln (list 'match-here res input))
(if (empty? res) input
(let ([re (first res)] [res (skip res)])
(or (and (end? re) (empty? res) (empty? input) '())
(and (something? re) (not (empty? input)) (match-here res (skip input)))
(and (star? re) (match-star (sub re) res input))
(and (star/ng? re) (match-star/ng (sub re) res input))
(and (plus? re) (match-plus (sub re) res input))
(and (plus/ng? re) (match-plus/ng (sub re) res input))
(and (question? re) (match-question (sub re) res input))
(and (group? re) (match-group (group-name re) (group-sequence re) res input))
(and (back? re) (match-back (back-name re) res input))
(and (literal? re) (not (empty? input))
(equal? (first input) re) (match-here res (skip input)))
(and (or? re) (match-or (alts re) res input))
; if re is a list, it is a sequence of regular expressions
(and (list? re) (match-here (append re res) input))
))))
(define (match-star re res input) ; greedy
; (displayln (list 'match-star re res input))
; match re*res at the beginning of input
(let loop ([input input])
(let ([input+ (match-here (list re) input)])
(or (and input+ (match-star re res input+))
(match-here res input)))))
(define (match-star/ng re res input) ; non-greedy: shortest first
; (displayln (list 'match-star re res input))
; match re*res at the beginning of input
(or (match-here res input)
(let ([input+ (match-here (list re) input)])
(and input+
(match-star re res input+)))))
(define (match-plus re res input) ; greedy
; (displayln (list 'match-plus re res input))
; match re+res at the beginning of input
(let ([input+ (match-here (list re) input)])
(and input+
(match-star re res input+))))
(define (match-plus/ng re res input) ; non greedy
; (displayln (list 'match-plus re res input))
; match re+res at the beginning of input
(let ([input+ (match-here (list re) input)])
(and input+
(match-star/ng re res input+))))
(define (match-question re res input) ; greedy
; (displayln (list 'match-question re res input))
; match zero or one occurence of re at the beginning of the input
(let ([input+ (match-here (list re) input)])
(or (and input+ (match-here res input+))
(match-here res input))))
(define (match-or alts res input) ; greedy
; (displayln (list 'match-or alts res input))
; match one of the regular expressions in alts at the beginning of input
(if (empty? alts)
(match-here res input)
(for/or ([alt alts])
(define input+ (match-here (list alt) input))
(and input+ (match-here res input+)))))
(define (match-group name gres res input)
; (displayln (list 'match-group name gres res input))
(define new-groups #f)
(define input+ (match-here gres input))
(define input1 (and input+
(let ([matched (take input (- (length input) (length input+)))])
(parameterize ([named-groups (hash-set (named-groups) name matched)])
(define input++ (match-here res input+))
(when input++
(set! new-groups (named-groups)))
input++))))
(when input1
(named-groups new-groups))
input1)
(define (match-back name res input)
; (displayln (list 'match-back name res input))
(define captured (hash-ref (named-groups) name (box #f)))
(if (box? captured)
(match-here res input) ; should a non-existant back reference give an error?
(let ([input+ (match-here captured input)]) ; clean captured first?
(and input+
(match-here res input+)))))
(define-syntax (regular-match stx)
(syntax-parse stx
[(_regular-match r:Sequence mc:MatchClause ... ec:ElseClause)
(with-syntax ([([to-match result] ...) #'(mc ...)]
[[_ else-result] #'ec])
(syntax/loc stx
(cond
[(match-anywhere 'r to-match) result]
...
[else else-result])))]
[(_regular-match r:Sequence mc:MatchClause ...)
(syntax/loc stx
(regular-match r mc ... [else (void)]))]))
;; Tests
; All tests are supposed to return #t.
#;(and
(regular-match (_) ['(1) #t] [else #f]) ; match something somewhere
(regular-match (_) ['(1 2) #t] [else #f]) ; ditto
(regular-match (_) ['() #f] [else #t]) ; something can't match nothing - but an empty list is not nothing
(regular-match (^ _) ['(1) #t] [else #f]) ; match something at the beginning
(regular-match (^ _) ['(1 2) #t] [else #f]) ; ditto
(regular-match (^ _) ['() #f] [else #t]) ; something can't match nothing
(regular-match (^ _ $) ['(1) #t] [else #f]) ; match something at the beginning and end
(regular-match (^ _ $) ['(1 2) #f] [else #t]) ; 1 is not at end, 2 is not at beginning
(regular-match (^ _ $) ['() #f] [else #t]) ; something can't match nothing
(regular-match (_ $) ['(1) #t] [else #f]) ; match something at the end
(regular-match (_ $) ['(1 2) #t] [else #f]) ; 1 is not at end, but 2 is
(regular-match (_ $) ['() #f] [else #t]) ; something can't match nothing
(regular-match (^ 1 $) ['(1) #t] [else #f]) ; match 1 at the beginning and end
(regular-match (^ 1 $) ['(2) #f] [else #t]) ; 1 does not match 2
(regular-match (1 $) ['(1) #t] [else #t]) ; match 1 at the end
(regular-match (1 $) ['(2) #f] [else #t]) ; 1 does not match 2
(regular-match (1 2 3) ['(1 2 3) #t] [else #f]) ; match 1 2 3 somewhere
(regular-match (1 2 3) ['(1 2 3 4) #t] [else #f]) ; match 1 2 3 somewhere
(regular-match (1 2 3) ['(0 1 2 3) #t] [else #f]) ; match 1 2 3 somewhere
(regular-match (1 2 3) ['(0 1 2 3 4) #t] [else #f]) ; match 1 2 3 somewhere
(regular-match (1 2 3) ['(0 1 2 5 3) #f] [else #t]) ; match 1 2 3 somewhere
(regular-match (^ (* 1) $) ['() #t] [else #f]) ; match 1 zero or more times
(regular-match (^ (* 1) $) ['(1) #t] [else #f]) ; match 1 zero or more times
(regular-match (^ (* 1) $) ['(1 1) #t] [else #f]) ; match 1 zero or more times
(regular-match (^ (* 1) $) ['(1 1 1) #t] [else #f]) ; match 1 zero or more times
(regular-match ((* 1) $) ['(0 1 1) #t] [else #f]) ; match 1 zero or more times at the end
(regular-match (^ (* 1)) ['(1 1 2) #t] [else #f]) ; match 1 zero or more times at the beginning
(regular-match (1 (* _) 2) ['() #f] [else #t]) ; match 1 eventually followed by 2
(regular-match (1 (* _) 2) ['(1) #f] [else #t]) ; match 1 eventually followed by 2
(regular-match (1 (* _) 2) ['(1 3) #f] [else #t]) ; match 1 eventually followed by 2
(regular-match (1 (* _) 2) ['(1 2) #t] [else #f]) ; match 1 eventually followed by 2
(regular-match (1 (* _) 2) ['(1 3 2) #t] [else #f]) ; match 1 eventually followed by 2
(regular-match (1 (* _) 2) ['(1 3 3 2) #t] [else #f]) ; match 1 eventually followed by 2
(regular-match (1 (* _) 2) ['(0 1 3 3 2 5) #t] [else #f]) ; match 1 eventually followed by 2
(regular-match (^ (*? 1) $) ['() #t] [else #f]) ; match 1 zero or more times
(regular-match (^ (*? 1) $) ['(1) #t] [else #f]) ; match 1 zero or more times
(regular-match (^ (*? 1) $) ['(1 1) #t] [else #f]) ; match 1 zero or more times
(regular-match (^ (*? 1) $) ['(1 1 1) #t] [else #f]) ; match 1 zero or more times
(regular-match ((*? 1) $) ['(0 1 1) #t] [else #f]) ; match 1 zero or more times at the end
(regular-match (^ (*? 1)) ['(1 1 2) #t] [else #f]) ; match 1 zero or more times at the beginning
(regular-match (1 (*? _) 2) ['() #f] [else #t]) ; match 1 eventually followed by 2
(regular-match (1 (*? _) 2) ['(1) #f] [else #t]) ; match 1 eventually followed by 2
(regular-match (1 (*? _) 2) ['(1 3) #f] [else #t]) ; match 1 eventually followed by 2
(regular-match (1 (*? _) 2) ['(1 2) #t] [else #f]) ; match 1 eventually followed by 2
(regular-match (1 (*? _) 2) ['(1 3 2) #t] [else #f]) ; match 1 eventually followed by 2
(regular-match (1 (*? _) 2) ['(1 3 3 2) #t] [else #f]) ; match 1 eventually followed by 2
(regular-match (1 (*? _) 2) ['(0 1 3 3 2 5) #t] [else #f]) ; match 1 eventually followed by 2
(regular-match (^ (+ 1) $) ['() #f] [else #t]) ; match 1 zero or more times
(regular-match (^ (+ 1) $) ['(1) #t] [else #f]) ; match 1 zero or more times
(regular-match (^ (+ 1) $) ['(2) #f] [else #t]) ; match 1 zero or more times
(regular-match (^ (+ 1) $) ['(1 1) #t] [else #f]) ; match 1 zero or more times
(regular-match (^ (+ 1) $) ['(1 1 1) #t] [else #f]) ; match 1 zero or more times
(regular-match ((+ 1) $) ['(0 1 1) #t] [else #f]) ; match 1 zero or more times at the end
(regular-match (^ (+ 1)) ['(1 1 2) #t] [else #f]) ; match 1 zero or more times at the beginning
(regular-match (1 (+ _) 2) ['() #f] [else #t]) ; match 1, something eventually followed by 2
(regular-match (1 (+ _) 2) ['(1) #f] [else #t]) ;
(regular-match (1 (+ _) 2) ['(1 3) #f] [else #t]) ;
(regular-match (1 (+ _) 2) ['(1 2) #f] [else #t]) ;
(regular-match (1 (+ _) 2) ['(1 3 2) #t] [else #f]) ;
(regular-match (1 (+ _) 2) ['(1 3 3 2) #t] [else #f]) ;
(regular-match (1 (+ _) 2) ['(0 1 3 3 2 5) #t] [else #f]) ;
(regular-match (1 (? _) 2) ['() #f] [else #t]) ; match 1 2 maybe something between
(regular-match (1 (? _) 2) ['(1) #f] [else #t]) ;
(regular-match (1 (? _) 2) ['(1 3) #f] [else #t]) ;
(regular-match (1 (? _) 2) ['(1 2) #t] [else #f]) ;
(regular-match (1 (? _) 2) ['(1 3 2) #t] [else #f]) ;
(regular-match (1 (? _) 2) ['(1 3 3 2) #f] [else #t]) ;
(regular-match (1 (? _) 2) ['(0 1 3 3 2 5) #f] [else #t]) ;
(regular-match (^ (? 1) $) ['() #t] [else #f]) ; match 1 zero or one times
(regular-match (^ (? 1) $) ['(1) #t] [else #f]) ; match 1 zero or one times
(regular-match (^ (? 1) $) ['(2) #f] [else #t]) ; match 1 zero or one times
(regular-match (^ (? 1) $) ['(1 1) #f] [else #t]) ; match 1 zero or one times
(regular-match (^ (? 1) $) ['(1 1 1) #f] [else #t]) ; match 1 zero or one times
(regular-match ((? 1) $) ['(0 1 1) #t] [else #f]) ; match 1 zero or one times at the end
(regular-match ((? 1) $) ['(0 2 2) #t] [else #f]) ; match 1 zero or one times at the end
(regular-match (^ (? 1)) ['(1 1 2) #t] [else #f]) ; match 1 zero or one times at the beginning
(regular-match (^ (? 1)) ['(2 2 2) #t] [else #f]) ; match 1 zero or one times at the beginning
(regular-match (^ (/ 1 2) $) ['() #f] [else #t]) ; match 1 or 2
(regular-match (^ (/ 1 2) $) ['(1) #t] [else #f]) ; match 1 or 2
(regular-match (^ (/ 1 2) $) ['(2) #t] [else #f]) ; match 1 or 2
(regular-match (^ (/ 1 2) $) ['(3) #f] [else #t]) ; match 1 or 2
(regular-match (^ (/ 1 2) $) ['(1 3) #f] [else #t]) ; match 1 or 2
(regular-match (^ (/ 1 2) $) ['(1 2) #f] [else #t]) ; match 1 or 2
(regular-match (^ (group a 1) 2) ['(1 2) #t] [else #f])
(regular-match (^ (group a (* 1)) 2 $) ['(1 1 2) #t] [else #f])
(regular-match (^ (group a (* 1)) 2 (back a) $) ['(1 1 2 1 1) #t] [else #f]) ; match same number before and after 2
(regular-match (^ (group a (* 1)) 2 (back a) $) ['(1 1 2 1 1 1) #f] [else #t]) ; match same number before and after 2
(regular-match (^ (group a (* 1)) (group b (* 2)) (back a) (back b) $) ['(1 1 2 1 1 2) #t] [else #f])
(regular-match (^ (group a (* 1)) (group b (* 2)) (back a) (back b) $) ['(1 1 2 1 1 2 2) #f] [else #t])
(regular-match (^ (group a _) (group as (* (back a)))
(group b _) (group bs (* (back b)))
(back a) (back as)
(back b) (back bs) $) ['(1 1 2 2 2 1 1 2 2 2) #t] [else #f])
(regular-match (^ (group a _) (group as (* (back a)))
(group b _) (group bs (* (back b)))
(back a) (back as)
(back b) (back bs) $) ['(1 1 2 2 2 1 1 2 2 2 2) #f] [else #t])
; check that *? and * are non-greedy and greedy respectively
(equal? (match-here '((*? 1)) '(1 1 1)) '(1 1 1)) ; non greedy
(equal? (match-here '((* 1)) '(1 1 1)) '()) ; greedy
; check that +? and + are non-greedy and greedy respectively
(equal? (match-here '((+? 1)) '(1 1 1)) '(1 1)) ; non greedy
(equal? (match-here '((+ 1)) '(1 1 1)) '()) ; greedy
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment