Skip to content

Instantly share code, notes, and snippets.

@drsimonj
Created September 2, 2018 12:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save drsimonj/0646565951204538d72f27a672d8ed60 to your computer and use it in GitHub Desktop.
Save drsimonj/0646565951204538d72f27a672d8ed60 to your computer and use it in GitHub Desktop.
regex - recursive EBNF
## BASE FUNCTION
# S3 generic for the <base> rule of the grammar: turn `obj` into the text of
# a base element. The method is chosen from the class of `obj`.
base <- function(obj) {
  UseMethod("base")
}

# Fallback method: anything without a more specific method is coerced to
# character and returned as is.
base.default <- function(obj) {
  text <- as.character(obj)
  text
}

# Method for objects of class "regex": the text is wrapped in parentheses so
# the whole expression can be used as a single base element.
base.regex <- function(obj) {
  opening <- "("
  closing <- ")"
  paste0(opening, obj, closing)
}
# Plain character input falls through to base.default() and comes back as "a".
base("a")
# Numeric input is coerced with as.character() by base.default(), giving "1".
base(1)
# Tag a string with the "regex" class so that base() dispatches to base.regex().
# `x` is reused by the examples further down.
x <- "a"
class(x) <- c("regex", class(x))
# The regex method wraps the text in parentheses: "(a)".
base(x)
# QUANTIFIED FUNCTION (FACTOR)
# <quantified> rule: append an explicit repetition range to a base element.
#
# base:           text of the base element (the argument shadows the `base()`
#                 function inside this body, which is harmless because the
#                 function is not called here).
# quantifier_min: lower repetition bound, written before the comma.
# quantifier_max: upper repetition bound, written after the comma.
#
# Returns `base` followed by "{min,max}"; the range is always written out,
# including the default "{1,1}".
quantified <- function(base, quantifier_min = 1, quantifier_max = 1) {
  bounds <- paste0("{", quantifier_min, ",", quantifier_max, "}")
  paste0(base, bounds)
}
# With the default bounds the range is still written out: "a{1,1}".
quantified(base("a"))
# `x` carries the "regex" class, so base() parenthesises it first: "(a){1,1}".
quantified(base(x))
# TERM FUNCTION
# <term> rule: a term is a sequence of quantified elements written one after
# another.
#
# ...: character vectors holding the quantified elements; they are flattened
#      with c() in the order given.
#
# Returns a single string with all elements concatenated without a separator.
term <- function(...) {
  paste0(c(...), collapse = "")
}
# Two elements passed as separate arguments: "a{1,1}b{1,1}".
term(quantified(base("a")), quantified(base("b")))
# The same two elements passed as one vector give the same string, because
# term() flattens its arguments with c().
term(c(quantified(base("a")), quantified(base("b"))))
# Vector and scalar arguments can be mixed: "a{1,1}b{1,1}(a){1,1}".
term(c(quantified(base("a")), quantified(base("b"))), quantified(base(x)))
# REGEX FUNCTION
# <regex> rule: either a single term, or a term followed by "|" and another
# regex.
#
# term: text of the leading term.
# reg:  optional regex to offer as the alternative; it is passed through
#       base() before being appended after "|".
#
# Returns the pattern with "regex" prepended to its class vector.
regex <- function(term, reg) {
  pattern <- if (missing(reg)) {
    term
  } else {
    paste(term, base(reg), sep = "|")
  }
  class(pattern) <- c("regex", class(pattern))
  pattern
}
# A single term promoted to a regex: the text "a{1,1}b{1,1}" tagged with
# class "regex".
regex(term(quantified(base("a")), quantified(base("b"))))
# Keep the same regex so it can be nested below.
y <- regex(term(quantified(base("a")), quantified(base("b"))))
# Alternation: `y` has class "regex", so base() parenthesises it, giving
# "x{1,1}y{1,1}|(a{1,1}b{1,1})".
test_regex <- regex(
term(quantified(base("x")), quantified(base("y"))),
y
)
# Bare name: evaluated at top level this prints the definition of regex().
regex
# One level deeper: "1{1,1}|(x{1,1}y{1,1}|(a{1,1}b{1,1}))".
regex(
term(quantified(base(1))),
test_regex
)
# Try the generated pattern on real text (requires the stringr package).
# NOTE(review): the first alternative should match "xy"; how stringr treats an
# object whose class is "regex" has not been verified here - confirm by running.
stringr::str_extract_all("thda jd skj xy", test_regex)
@martijnvanbeers
Copy link

Our adjusted EBNF grammar

<regex>         := <term> '|' <regex> | <term>                                                                                                                                                 
                                                                                                                                                                                               
<term>          := { <quantified> }                                                                                                                                                            
                                                                                                                                                                                               
<quantified>    := <base> [ '{', <num>, ',', <num>, '}' ]                                                                                                                                      
                                                                                                                                                                                               
<base>          := <char> |                                                                                                                                                                    
                '(', <regex>, ')'                                                                                                                                                              
                                                                                                                                                                                               
<char>          := 'a' | 'b' | ... | 'z' | 'A' | ... | 'Z' | '0' | ... | '9'                                                                                                                   

@drsimonj
Copy link
Author

drsimonj commented Sep 2, 2018

A more advanced example (truer to the grammar):

### GRAMMAR FUNCTIONS -----------------

## BASE FUNCTION
# S3 generic for the <base> rule of the grammar.
#
# x:   object to turn into a base element; dispatch is on its class.
# ...: passed on to methods.
#
# Methods return a character vector with class c("base", "character").
base <- function(x, ...) {
  UseMethod("base")
}

# Fallback for input that is not character (numbers, logicals, ...): coerce
# to character first, so that a call such as `base(1)` yields the base
# element "1" instead of failing with "no applicable method".
base.default <- function(x, ...) {
  base.character(as.character(x), ...)
}

# A plain string is already a valid base element; it only needs the tag.
base.character <- function(x, ...) {
  class(x) <- c("base", "character")
  x
}

# A complete regex can only act as a base element when grouped, so it is
# wrapped in parentheses before being tagged.
base.regex <- function(x, ...) {
  x <- paste0("(", x, ")")
  base.character(x, ...)
}

# A plain string becomes a "base" object holding "a".
base("a")
# Numeric input: this call depends on base() having a method that accepts
# non-character values.
base(1)

# A string tagged as "regex" is parenthesised by base.regex(): "(a)".
zz <- "a"
class(zz) <- c("regex", class(zz))
base(zz)

# QUANTIFIED FUNCTION (FACTOR)
# S3 generic for the <quantified> rule: attach a `{min,max}` repetition range
# to a base element.
#
# x:   the element to quantify; dispatch is on its class.
# min: smallest number of repetitions, a whole number >= 0.
# max: largest number of repetitions, a whole number >= `min`, or `Inf` for
#      "no upper bound".
# ...: passed on to methods.
#
# Methods return a character vector with class c("quantified", "character").
quantify <- function(x, min = 1, max = 1, ...) {
  UseMethod("quantify")
}

# Fallback for input that is not yet a "base" object (for example a bare
# string, as in `quantify("b", max = 2)`): promote it with base() and then
# quantify the result.
quantify.default <- function(x, min = 1, max = 1, ...) {
  quantify.base(base(x), min = min, max = max, ...)
}

# Method for "base" objects. Stops with an error when `min` or `max` is not a
# valid bound (see the generic for the accepted values).
quantify.base <- function(x, min = 1, max = 1, ...) {
  # Convert one bound to an integer; NA signals "not a single whole number".
  # The value is checked *before* integer coercion, because as.integer()
  # silently truncates 1.5 to 1 and turns unparsable input into NA, which
  # would otherwise surface as an unrelated "missing value" error in `if`.
  as_whole <- function(value) {
    if (length(value) != 1L) {
      return(NA_integer_)
    }
    number <- suppressWarnings(as.numeric(value))
    if (!is.finite(number) || number != trunc(number) ||
        abs(number) > .Machine$integer.max) {
      return(NA_integer_)
    }
    as.integer(number)
  }

  min <- as_whole(min)
  if (is.na(min) || min < 0L) {
    stop("`min` is not an integer of 0 or more", call. = FALSE)
  }

  # Only +Inf means "no upper bound"; it is written as an empty maximum.
  # -Inf is not a usable bound and is rejected by the check below.
  unbounded <- length(max) == 1L && is.numeric(max) &&
    is.infinite(max) && max > 0
  if (unbounded) {
    max <- ""
  } else {
    max <- as_whole(max)
    if (is.na(max) || max < min) {
      stop("`max` is not an integer greater than or equal to `min`",
           call. = FALSE)
    }
  }

  # `{1,1}` is the implicit default, so it is left off to keep patterns short.
  if (!(min == 1L && identical(max, 1L))) {
    x <- paste0(x, "{", min, ",", max, "}")
  }

  class(x) <- c("quantified", "character")
  x
}

# Default bounds (1, 1) add no range: the text stays "a".
quantify(base("a"))
# Optional element: "a{0,1}".
quantify(base("a"), min = 0)
# `max` given positionally: "a{3,10}".
quantify(base("a"), min = 3, 10)
# `zz` has class "regex", so base() parenthesises it; default bounds: "(a)".
quantify(base(zz))
# An infinite maximum is written as an open-ended range: "(a){1,}".
quantify(base(zz), max = Inf)

# TERM FUNCTION
# S3 generic for the <term> rule: a term is a sequence of quantified elements
# written one after another.
#
# ...: the elements of the term. Because `...` is the only formal, dispatch
#      is on the class of the first element supplied.
#
# Methods return a single string with class c("term", "character").
term <- function(...) {
  UseMethod("term")
}

# Fallback used when the first element is not a "quantified" object (for
# example a bare string, as in `term("a", "b")`). Elements are promoted one
# by one, so the same implementation applies.
term.default <- function(...) {
  term.quantified(...)
}

# Method used when the first element is a "quantified" object.
#
# The arguments are collected with list(), not c(): c() drops the class
# attribute of every element, which would hide which elements are already
# quantified and would make a "regex" element lose the parentheses that
# base() adds for it.
term.quantified <- function(...) {
  parts <- lapply(list(...), function(part) {
    if (inherits(part, "quantified")) {
      return(part)
    }
    # Promote anything else step by step: raw value -> base -> quantified.
    if (!inherits(part, "base")) {
      part <- base(part)
    }
    quantify(part)
  })

  # Strip the classes and flatten, so vector-valued elements contribute each
  # of their strings in order.
  pieces <- unlist(lapply(parts, unclass), use.names = FALSE)

  out <- paste0(pieces, collapse = "")
  class(out) <- c("term", "character")
  out
}

# Bare strings as elements; this call depends on term() having a method for
# plain character input. Intended result: "ab".
term("a", "b")
# Mixing a bare string with a quantified one; also depends on quantify()
# accepting a bare string. Intended result: "ab{1,2}".
term("a", quantify("b", max = 2))
# Both elements quantified explicitly. Intended result: "a{0,1}b{1,2}".
term(quantify("a", min = 0), quantify("b", max = 2))

# REGEX FUNCTION
# S3 generic for the <regex> rule: either a single term, or a term followed
# by "|" and another regex.
#
# x:   the leading term; dispatch is on its class.
# y:   optional "regex" object offered as the alternative.
# ...: passed on to methods.
#
# Methods return a character vector with class c("regex", "character").
regex <- function(x, y, ...) {
  UseMethod("regex")
}

# Fallback for input that is not yet a "term" (for example a bare string, as
# in `regex("a")`): promote it to a term first. term.quantified() is called
# directly so the promotion does not depend on how term() dispatches on `x`.
regex.default <- function(x, y, ...) {
  x <- term.quantified(x)
  if (missing(y)) {
    regex.term(x, ...)
  } else {
    regex.term(x, y, ...)
  }
}

# Method for "term" objects. Stops with an error when `y` is supplied but is
# not a "regex" object.
regex.term <- function(x, y, ...) {
  if (!missing(y)) {
    if (!inherits(y, "regex")) {
      stop("`y` must be a regex object", call. = FALSE)
    }

    x <- paste(x, y, sep = "|")
  }

  class(x) <- c("regex", "character")
  x
}

# Bare string: this call depends on regex() having a method for plain
# character input.
regex("a")
# A term built from a bare string; depends on term() accepting one.
regex(term("a"))

# Fully explicit construction of the single-term regex "a".
regex(term(quantify(base("a"))))

# Two alternatives: "a|b".
regex(term(quantify(base("a"))), regex(term(quantify(base("b")))))

# Alternatives nest to the right, mirroring <regex> := <term> '|' <regex>.
# Result: "a|b|c".
regex(term(quantify(base("a"))),
      regex(term(quantify(base("b"))),
            regex(term(quantify(base("c"))))))

# A whole regex used as a base element is parenthesised by base() and can then
# be quantified: "(a|b|c){1,}".
quantify(base(
  regex(term(quantify(base("a"))),
      regex(term(quantify(base("b"))),
            regex(term(quantify(base("c"))))))), max = Inf)

# Testing -----------------------------------------------------------------

# Sample text containing several "c...l" words to match against.
s <- "col cal cel cil caal cool coal citl"

# Baseline with a literal pattern (requires the stringr package).
stringr::str_extract_all(s, "col")

# Build "c(a|o|c){1,2}l": a "c", then one or two of a/o/c, then an "l".
pattern <- regex(term(
  quantify(base("c")),
  
  # The alternation is a regex, so base() groups it before it is quantified.
  quantify(base(regex(term(quantify(base("a"))),
          regex(term(quantify(base("o"))),
                regex(term(quantify(base("c"))))))), min = 1, max = 2),
  
  quantify(base("l"))
))

# NOTE(review): read as a regular expression, the pattern should pick out
# "col", "cal", "caal", "cool" and "coal"; how stringr treats an object whose
# class is "regex" has not been verified here - confirm by running.
stringr::str_extract_all(s, pattern)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment