Skip to content

Instantly share code, notes, and snippets.

@chaoxu
Last active March 5, 2024 09:01
Show Gist options
  • Save chaoxu/5210853 to your computer and use it in GitHub Desktop.
Save chaoxu/5210853 to your computer and use it in GitHub Desktop.
Build a regular expression to match all non-negative integers between a and b.
module RegIntIntval (RegEx, matchIntRange, matchLessThan, matchGreaterThan) where
import Data.Digits
import Text.Regex.PCRE.Light
import Data.ByteString (pack)
import Data.ByteString.Internal (c2w)
import Data.Maybe
import Data.List
import FreeMonoidCompress
-- build regular expression for positive integer ranges
-- in base 10. The program uses operations [x-y],|,\d,*,(),{}
-- Internal representaion
data RegEx = Range Int Int | All | AtLeast RegEx Int
| Or [RegEx] | Concat [RegEx] | Repetition RegEx Int Int
| Epsilon | Line RegEx
deriving (Eq, Ord)-- , Show)
-- You could also use "[0-9]" if you like
alphabet = "\\d"
-- Output the regular expression
instance Show RegEx where
show (Range i j)
| i == j = show i
| i+1 == j = concat ["[",show i,show j,"]"]
| i+2 == j = concat ["[",show i,show (i+1), show (i+2),"]"]
| otherwise = concat ["[",show i,"-",show j,"]"]
show (Or xs) = "("++ tail (concatMap (\x->'|':show x) xs) ++ ")"
show (Concat xs) = concatMap show xs
show (Repetition x n m)
| n /= m = concat [c,"{",show n,",",show m,"}"]
| length a < length b = a
| otherwise = b
where a = concat [c,"{",show n,"}"]
b = concat (replicate n (show x))
c = if atomic x then show x else "(" ++ show x ++ ")"
show Epsilon = ""
show (AtLeast x n)
| n == 0 = show x ++ "*"
| n == 1 = show x ++ "+"
| length a < length b = a
| otherwise = b
where a = concat (replicate n (show x))++"+"
b = concat [show x,"{",show n,",}"]
show (Line x) = "^" ++ show x ++ "$"
show All = alphabet
atomic All = True
atomic (Range _ _) = True
atomic (Concat xs)
| length xs > 1 = False
| otherwise = atomic (head xs)
atomic (Or _) = True
atomic Epsilon = True
atomic (AtLeast _ _) = False
-- Main method. Match integers in a certain range
matchIntRange :: Integer->Integer->RegEx
matchIntRange a b
| 0 > min a b = error "Negative input"
| a > b = Line Epsilon
| otherwise = Line (mergeBetter $ reduce $ build (d a) (d b))
-- | otherwise = Line (reduce $ build (d a) (d b))
where build :: [Int]->[Int]->RegEx
build [] [] = Concat []
build (a@(x:xs)) (b@(y:ys))
| sl && x == y = Concat [Range x x, build xs ys]
| sl && all9 && all0 = Concat [Range x y, Repetition All n n]
| sl && all0 = Or [Concat [Range x (y-1), Repetition All n n], upper]
| sl && all9 = Or [lower,Concat [Range (x+1) y, Repetition All n n]]
| sl && x+1 <= y-1 = Or [lower, middle, upper]
| sl = Or [lower, upper]
| otherwise = Or [build a (nines la), build (1:zeros la) b]
where (la,lb) = (length a, length b)
sl = la == lb
n = length xs
upper = Concat [Range y y, build (zeros n) ys]
lower = Concat [Range x x, build xs (nines n)]
middle = Concat [Range (x+1) (y-1), Repetition All n n]
all9 = all (==9) ys
all0 = all (==0) xs
zeros n = replicate n 0
nines n = replicate n 9
d 0 = [0]
d n = map fromIntegral $ digits 10 n
-- Transforms
-- We should reduce the associative operations
-- After reduce, (Or xs) would be (Or ys), where ys contain no Or's.
reduce :: RegEx->RegEx
reduce (Or xs)
| length xs == 1 = head xs
| otherwise = Or (reverse $ foldl combine [] result)
where result = map reduce xs
combine agg (Or ys) = reverse ys++agg
combine agg n = n:agg
reduce (Concat xs)
| length xs == 1 = head xs
| otherwise = Concat (reverse $ foldl combine [] result)
where result = map reduce xs
combine agg (Concat ys) = reverse ys ++ agg
combine agg n = n:agg
reduce x = x
-- merge repeated set of all strings together
-- commutative thus sorting is nice
mergeAll :: RegEx->RegEx
mergeAll (Or xs) = Or (f [] v)
where v :: [RegEx]
v = sort $ map mergeAll xs
u :: RegEx->[RegEx]
u (Concat x) = x
u x = []
f :: [RegEx]->[RegEx]->[RegEx]
f [] (x:xs) = f [x] xs
f accum [] = accum
f accum xs = f (init accum ++ merges (last accum) (head xs)) (tail xs)
merges :: RegEx->RegEx->[RegEx]
merges xs ys
| notnull && px == py = m sx sy
| otherwise = [xs,ys]
where m (Repetition x a b) (Repetition y c d)
| x == y && a<=d && b>=c || b+1 == c = [Concat (px ++ [Repetition x (min a c) (max b d)])]
| otherwise = [xs,ys]
m (Repetition x a b) (AtLeast y c)
| x == y && c<=b+1 = [Concat (px ++ [AtLeast x (min a c)])]
| otherwise = [xs,ys]
m x y = [xs,ys]
(px,sx) = (init $ u xs, last $ u xs)
(py,sy) = (init $ u ys, last $ u ys)
notnull = min (length $ u xs) (length $ u ys) > 0
-- Not commutative
mergeAll (Concat xs) = mergeMonoid (Concat (map mergeAll xs))
mergeAll x = x
atom (Power _ _) = False
atom (Atom x) = atomic x
atom (List xs) -- = False
| length xs > 1 = False
| otherwise = atom (head xs)
mergeMonoid (Concat xs) = Concat (expand result)
where result = freeMonoidCompress regexWeight xs
expand (Atom x) = [x]
expand (List xs) = concatMap expand xs
expand (Power x n) = [Repetition (Concat (expand x)) n n]
regexWeight (Atom x) = length $ show x
regexWeight (List xs) = sum $ map regexWeight xs
regexWeight (Power x n) = min (regexWeight $ List (replicate n x)) (overhead1 + 2 + length (show n) + w)
where w = regexWeight x
overhead1 = if (atom x) then 0 else 2
-- because we like the ordering a lot
mergeBetter x
| length (show x) > length (show y) = y
| otherwise = x
where y = mergeAll x
matchLessThan :: Integer->RegEx
matchLessThan n = Line (matchIntRange 0 n)
matchGreaterThan :: Integer->RegEx
matchGreaterThan a = Line (mergeBetter $ reduce $ Or [matchIntRange a b,Concat [Range 1 9, AtLeast All n]])
where n = length $ digits 10 a
b = unDigits 10 $ replicate n 9
-- Tests
--Assert if matchIntRange i j works
test i j = all isJust [match re (num k) []|k<-[i..j]] &&
all isNothing [match re (num k) []|k<-[0..i-1]] &&
all isNothing [match re (num k) []|k<-[j+1..9*j]]
where re = compile (s2w ("^("++show (mergeAll $ matchIntRange i j)++")$") ) []
num n = s2w $ show n
s2w = pack . map c2w
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment