Last active
March 5, 2024 09:01
-
-
Save chaoxu/5210853 to your computer and use it in GitHub Desktop.
Build a regular expression to match all non-negative integers between a and b.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module RegIntIntval (RegEx, matchIntRange, matchLessThan, matchGreaterThan) where | |
import Data.Digits | |
import Text.Regex.PCRE.Light | |
import Data.ByteString (pack) | |
import Data.ByteString.Internal (c2w) | |
import Data.Maybe | |
import Data.List | |
import FreeMonoidCompress | |
-- build regular expression for positive integer ranges | |
-- in base 10. The program uses operations [x-y],|,\d,*,(),{} | |
-- Internal representaion | |
data RegEx = Range Int Int | All | AtLeast RegEx Int | |
| Or [RegEx] | Concat [RegEx] | Repetition RegEx Int Int | |
| Epsilon | Line RegEx | |
deriving (Eq, Ord)-- , Show) | |
-- You could also use "[0-9]" if you like | |
alphabet = "\\d" | |
-- Output the regular expression | |
instance Show RegEx where | |
show (Range i j) | |
| i == j = show i | |
| i+1 == j = concat ["[",show i,show j,"]"] | |
| i+2 == j = concat ["[",show i,show (i+1), show (i+2),"]"] | |
| otherwise = concat ["[",show i,"-",show j,"]"] | |
show (Or xs) = "("++ tail (concatMap (\x->'|':show x) xs) ++ ")" | |
show (Concat xs) = concatMap show xs | |
show (Repetition x n m) | |
| n /= m = concat [c,"{",show n,",",show m,"}"] | |
| length a < length b = a | |
| otherwise = b | |
where a = concat [c,"{",show n,"}"] | |
b = concat (replicate n (show x)) | |
c = if atomic x then show x else "(" ++ show x ++ ")" | |
show Epsilon = "" | |
show (AtLeast x n) | |
| n == 0 = show x ++ "*" | |
| n == 1 = show x ++ "+" | |
| length a < length b = a | |
| otherwise = b | |
where a = concat (replicate n (show x))++"+" | |
b = concat [show x,"{",show n,",}"] | |
show (Line x) = "^" ++ show x ++ "$" | |
show All = alphabet | |
atomic All = True | |
atomic (Range _ _) = True | |
atomic (Concat xs) | |
| length xs > 1 = False | |
| otherwise = atomic (head xs) | |
atomic (Or _) = True | |
atomic Epsilon = True | |
atomic (AtLeast _ _) = False | |
-- Main method. Match integers in a certain range | |
matchIntRange :: Integer->Integer->RegEx | |
matchIntRange a b | |
| 0 > min a b = error "Negative input" | |
| a > b = Line Epsilon | |
| otherwise = Line (mergeBetter $ reduce $ build (d a) (d b)) | |
-- | otherwise = Line (reduce $ build (d a) (d b)) | |
where build :: [Int]->[Int]->RegEx | |
build [] [] = Concat [] | |
build (a@(x:xs)) (b@(y:ys)) | |
| sl && x == y = Concat [Range x x, build xs ys] | |
| sl && all9 && all0 = Concat [Range x y, Repetition All n n] | |
| sl && all0 = Or [Concat [Range x (y-1), Repetition All n n], upper] | |
| sl && all9 = Or [lower,Concat [Range (x+1) y, Repetition All n n]] | |
| sl && x+1 <= y-1 = Or [lower, middle, upper] | |
| sl = Or [lower, upper] | |
| otherwise = Or [build a (nines la), build (1:zeros la) b] | |
where (la,lb) = (length a, length b) | |
sl = la == lb | |
n = length xs | |
upper = Concat [Range y y, build (zeros n) ys] | |
lower = Concat [Range x x, build xs (nines n)] | |
middle = Concat [Range (x+1) (y-1), Repetition All n n] | |
all9 = all (==9) ys | |
all0 = all (==0) xs | |
zeros n = replicate n 0 | |
nines n = replicate n 9 | |
d 0 = [0] | |
d n = map fromIntegral $ digits 10 n | |
-- Transforms | |
-- We should reduce the associative operations | |
-- After reduce, (Or xs) would be (Or ys), where ys contain no Or's. | |
reduce :: RegEx->RegEx | |
reduce (Or xs) | |
| length xs == 1 = head xs | |
| otherwise = Or (reverse $ foldl combine [] result) | |
where result = map reduce xs | |
combine agg (Or ys) = reverse ys++agg | |
combine agg n = n:agg | |
reduce (Concat xs) | |
| length xs == 1 = head xs | |
| otherwise = Concat (reverse $ foldl combine [] result) | |
where result = map reduce xs | |
combine agg (Concat ys) = reverse ys ++ agg | |
combine agg n = n:agg | |
reduce x = x | |
-- merge repeated set of all strings together | |
-- commutative thus sorting is nice | |
mergeAll :: RegEx->RegEx | |
mergeAll (Or xs) = Or (f [] v) | |
where v :: [RegEx] | |
v = sort $ map mergeAll xs | |
u :: RegEx->[RegEx] | |
u (Concat x) = x | |
u x = [] | |
f :: [RegEx]->[RegEx]->[RegEx] | |
f [] (x:xs) = f [x] xs | |
f accum [] = accum | |
f accum xs = f (init accum ++ merges (last accum) (head xs)) (tail xs) | |
merges :: RegEx->RegEx->[RegEx] | |
merges xs ys | |
| notnull && px == py = m sx sy | |
| otherwise = [xs,ys] | |
where m (Repetition x a b) (Repetition y c d) | |
| x == y && a<=d && b>=c || b+1 == c = [Concat (px ++ [Repetition x (min a c) (max b d)])] | |
| otherwise = [xs,ys] | |
m (Repetition x a b) (AtLeast y c) | |
| x == y && c<=b+1 = [Concat (px ++ [AtLeast x (min a c)])] | |
| otherwise = [xs,ys] | |
m x y = [xs,ys] | |
(px,sx) = (init $ u xs, last $ u xs) | |
(py,sy) = (init $ u ys, last $ u ys) | |
notnull = min (length $ u xs) (length $ u ys) > 0 | |
-- Not commutative | |
mergeAll (Concat xs) = mergeMonoid (Concat (map mergeAll xs)) | |
mergeAll x = x | |
atom (Power _ _) = False | |
atom (Atom x) = atomic x | |
atom (List xs) -- = False | |
| length xs > 1 = False | |
| otherwise = atom (head xs) | |
mergeMonoid (Concat xs) = Concat (expand result) | |
where result = freeMonoidCompress regexWeight xs | |
expand (Atom x) = [x] | |
expand (List xs) = concatMap expand xs | |
expand (Power x n) = [Repetition (Concat (expand x)) n n] | |
regexWeight (Atom x) = length $ show x | |
regexWeight (List xs) = sum $ map regexWeight xs | |
regexWeight (Power x n) = min (regexWeight $ List (replicate n x)) (overhead1 + 2 + length (show n) + w) | |
where w = regexWeight x | |
overhead1 = if (atom x) then 0 else 2 | |
-- because we like the ordering a lot | |
mergeBetter x | |
| length (show x) > length (show y) = y | |
| otherwise = x | |
where y = mergeAll x | |
matchLessThan :: Integer->RegEx | |
matchLessThan n = Line (matchIntRange 0 n) | |
matchGreaterThan :: Integer->RegEx | |
matchGreaterThan a = Line (mergeBetter $ reduce $ Or [matchIntRange a b,Concat [Range 1 9, AtLeast All n]]) | |
where n = length $ digits 10 a | |
b = unDigits 10 $ replicate n 9 | |
-- Tests | |
--Assert if matchIntRange i j works | |
test i j = all isJust [match re (num k) []|k<-[i..j]] && | |
all isNothing [match re (num k) []|k<-[0..i-1]] && | |
all isNothing [match re (num k) []|k<-[j+1..9*j]] | |
where re = compile (s2w ("^("++show (mergeAll $ matchIntRange i j)++")$") ) [] | |
num n = s2w $ show n | |
s2w = pack . map c2w |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment