Last active
September 12, 2023 18:19
-
-
Save rntz/249e862b808ee4eeb12a6b138329ff15 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
data Regex = Class [Char] -- character class | |
| Seq [Regex] -- sequence, ABC | |
| Choice [Regex] -- choice, A|B|C | |
| Star Regex -- zero or more, A* | |
deriving (Show) | |
-- The language of a regex is, in theory, defined inductively as follows, substituting | |
-- lists for mathematical sets. | |
naive :: Regex -> [String] | |
naive (Class chars) = [[c] | c <- chars] | |
naive (Seq regexes) = concat <$> sequence (map naive regexes) | |
naive (Choice regexes) = union $ map naive regexes | |
where union = concat -- I did say we were being naive! | |
naive (Star r) = rstar | |
where rstar = [""] ++ ((++) <$> naive r <*> rstar) | |
-- The main problem here is that lists aren't sets, especially when infinity is involved. | |
-- In particular, the following two operations are tricky: | |
-- | |
-- union, {x | x ∈ s or x ∈ t} for Choice | |
-- "append", {x ++ y | x ∈ s, y ∈ t} for Seq | |
-- | |
-- On finite lists, the following approximations are fine: | |
-- | |
-- union xs ys = xs ++ ys | |
-- append xs ys = [x ++ y | x <- xs, y <- ys] | |
-- | |
-- `naive` is defined using n-ary equivalents of these. | |
-- | |
-- But on infinite lists, these are non-exhaustive, because they do not interleave fairly | |
-- between xs and ys. `union` must exhaust xs before it includes anything from ys; while | |
-- `append` must examine every element of xs before it advances a single element in ys. | |
-- | |
-- The next problem is star, and its interaction with termination. For most regexes r, the | |
-- iteration r* is an infinite language. However, if r is "small" (equivalent to either | |
-- Choice [] or Seq [], ie. empty or containing only the empty string), then r* is equal | |
-- to Seq [] (contains only the empty string). In this case, we'd like to compute [""], a | |
-- finite and terminating list. | |
-- | |
-- However, the naive definition | |
-- | |
-- star r = {""} ∪ {x ++ y | x ∈ r, y ∈ star r} | |
-- | |
-- is directly recursive, and its Haskell translation will not terminate. | |
-- We solve the issues with union and append by using a fair interleaving. We solve the | |
-- issue with star by tracking the "size" of a regex: | |
data Size = Empty -- no strings | |
| Small -- at most the empty string | |
| Finite -- includes some non-empty string but is finite | |
| Infinite -- infinite | |
deriving (Show, Eq) | |
instance Ord Size where | |
Empty <= _ = True; _ <= Infinite = True; Small <= Finite = True; x <= y = x == y | |
-- Returns the size of a language & a list of all strings in its language. | |
-- If the language is finite, the list will be too. | |
-- The list is exhaustive, even for infinite languages, but may contain duplicates. | |
-- Size calculation assumes that in (Class chars), chars is finite. | |
eval :: Regex -> (Size, [String]) | |
eval (Class chars) = (if null chars then Empty else Finite, [[c] | c <- chars]) | |
eval (Seq regexes) = (size, foldr appendo [""] langs) | |
where (sizes, langs) = unzip $ map eval regexes | |
size | any (== Empty) sizes = Empty | |
| otherwise = maximum (Small : sizes) | |
eval (Choice regexes) = (maximum (Empty : sizes), union langs) | |
where (sizes, langs) = unzip $ map eval regexes | |
eval (Star r) = (rstarsize, rstarlang) | |
where (rsize, rlang) = eval r | |
rstarsize | rsize <= Small = Small | |
| otherwise = Infinite | |
rstarlang = [""] ++ if rsize <= Small then [] else appendo rlang rstarlang | |
-- Fair implementations of union and append. | |
union :: [[String]] -> [String] | |
union [] = [] | |
union xss = [x | (x:_) <- xss] ++ union [xs | (_:xs) <- xss] -- breadth-first search | |
appendo :: [String] -> [String] -> [String] | |
appendo (x:xs) (y:ys) = (x ++ y) : union [map (x ++) ys, map (++ y) xs, appendo xs ys] | |
appendo _ _ = [] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment