Skip to content

Instantly share code, notes, and snippets.

@Perigord-Kleisli
Last active November 1, 2022 09:37
Show Gist options
  • Save Perigord-Kleisli/94fc1b196703e088b840fdb1f8f9302a to your computer and use it in GitHub Desktop.
Save Perigord-Kleisli/94fc1b196703e088b840fdb1f8f9302a to your computer and use it in GitHub Desktop.
Simple Regex Parser and Engine
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE TupleSections #-}
module Main (
main
) where
import Text.Megaparsec hiding (match)
import Text.Megaparsec.Char
import Data.Void
import Control.Monad
import Data.Functor
import Text.Megaparsec.Char.Lexer
import Data.Function
import Data.Maybe
data RegExp = Normal Char -- ^ A character that is not in "()*|."
| Any -- ^ Any character
| Beginning
| End
| ZeroOrMore RegExp -- ^ Zero or more occurances of the same regexp
| OneOrMore RegExp
| Possible RegExp
| AnyOf String
| Quantifier RegExp Int (Maybe Int)
| NoneOf String
| Or RegExp RegExp -- ^ A choice between 2 regexps
| Str [RegExp] -- ^ A sequence of regexps.
deriving (Show, Eq)
type Parser a = Parsec Void String a
expandRanges :: String -> String
expandRanges (a:'-':b:xs) = [a..b] <> expandRanges xs
expandRanges (x:xs) = x : expandRanges xs
expandRanges [] = []
normal, anyP, beginning, end, anyOf, noneOfP, parens, regexTerm, charExp, orP, str, regexp :: Parser RegExp
normal = Normal <$> noneOf "()*|."
anyP = Any <$ char '.'
beginning = Beginning <$ char '^'
end = End <$ char '$'
anyOf = AnyOf . expandRanges <$> (char '[' *> manyTill anySingle (char ']'))
noneOfP = NoneOf . expandRanges <$> (string "[^" *> manyTill anySingle (char ']'))
parens = char '(' *> regexp <* char ')'
regexTerm = parens <|> noneOfP <|> anyOf <|> beginning <|> end <|> anyP <|> normal
charExp = do
tok <- regexTerm
let rangeP = char '{' *> ((,) <$> (toEnum <$> decimal) <*> (char ',' *> optional decimal)) <* char '}'
(((,Nothing) <$> oneOf "+?*") <|> rangeP) <&> \case
('*',_) -> ZeroOrMore tok
('+',_) -> OneOrMore tok
('?',_) -> Possible tok
(x,y) -> Quantifier tok (fromEnum x) y
str = (Str .) . (:) <$> (try charExp <|> regexTerm)
<*> some (try charExp <|> regexTerm)
orP = Or <$> ((try str <|> try charExp <|> regexTerm) <* char '|')
<*> (try str <|> try charExp <|> regexTerm)
regexp = try orP <|> try str <|> try charExp <|> regexTerm
parseRegExp :: String -> Maybe RegExp
parseRegExp = either (const Nothing) Just . parse (regexp <* eof) ""
regexToParser :: RegExp -> Parser String
regexToParser (Normal c) = pure <$> char c
regexToParser Any = pure <$> anySingle
regexToParser Beginning = pure ""
regexToParser End = pure ""
regexToParser (ZeroOrMore rgx) = concat <$> many (regexToParser rgx)
regexToParser (OneOrMore rgx) = concat <$> some (regexToParser rgx)
regexToParser (Possible rgx) = fromMaybe "" <$> optional (regexToParser rgx)
regexToParser (Quantifier rgx minMatch (Just maxMatch)) =
(mappend `on` concat) <$> replicateM minMatch (regexToParser rgx)
<*> choice [try $ replicateM x (regexToParser rgx) | x <- reverse [1..maxMatch-minMatch]]
regexToParser (Quantifier rgx minMatch Nothing) =
(mappend `on` concat) <$> replicateM minMatch (regexToParser rgx)
<*> many (regexToParser rgx)
regexToParser (AnyOf chars) = pure <$> oneOf chars
regexToParser (NoneOf chars) = pure <$> noneOf chars
regexToParser (Or rgxA rgxB) = regexToParser rgxA <|> regexToParser rgxB
regexToParser (Str rgxs) = concat <$> traverse regexToParser rgxs
regex :: String -> String -> Maybe String
regex regexExpr s = do
parser <- regexToParser <$> parseRegExp regexExpr
either (const Nothing) Just $ parse parser "" s
validate :: (Eq a, Show a) => (String -> a) -> [(String, a)] -> IO ()
validate f = mapM_ $ \(a,b) -> do
unless (f a == b) (fail $ "Expected: \n" ++ show b ++ "\nBut got:\n" ++ show (f a))
validateRegex :: [(String,String,Maybe String)] -> IO ()
validateRegex = mapM_ $ \(a,b,c) -> do
unless (regex a b == c) (fail $ "Expected: \n" ++ show c ++ "\nBut got:\n" ++ show (regex a b))
main :: IO ()
main = do
validate parseRegExp
[ ( "ab*", Just (Str [Normal 'a', ZeroOrMore (Normal 'b')]) )
, ( "(ab)*", Just ( ZeroOrMore (Str [Normal 'a', Normal 'b'])) )
, ( "(ab)+", Just ( OneOrMore (Str [Normal 'a', Normal 'b'])) )
, ( "(ab){1,3}", Just ( Quantifier (Str [Normal 'a', Normal 'b']) 1 (Just 3) ) )
, ( "(ab){1,}", Just ( Quantifier (Str [Normal 'a', Normal 'b']) 1 Nothing ) )
, ( "(ab)?", Just ( Possible (Str [Normal 'a', Normal 'b'])) )
, ( "[abkj]?", Just ( Possible (AnyOf "abkj")) )
, ( "[a-e]?", Just ( Possible (AnyOf "abcde")) )
, ( "[0-8]?", Just ( Possible (AnyOf "012345678")) )
, ( "[^abkj]?", Just ( Possible (NoneOf "abkj")) )
, ( "(ab)+*", Nothing)
, ( "(ab)=", Just (Str [Str [Normal 'a', Normal 'b'], Normal '=']) )
, ( "^(ab)=$", Just (Str [Beginning, Str [Normal 'a', Normal 'b'], Normal '=', End]) )
, ( "ab|a", Just (Or (Str [Normal 'a',Normal 'b']) (Normal 'a')) )
, ( "ab|a", Just (Or (Str [Normal 'a',Normal 'b']) (Normal 'a')) )
, ( "a(b|a)", Just (Str [Normal 'a',Or (Normal 'b') (Normal 'a')]) )
, ( "a|b*", Just (Or (Normal 'a') (ZeroOrMore (Normal 'b'))) )
, ( "(a|b)*", Just (ZeroOrMore (Or (Normal 'a') (Normal 'b'))) )
, ( "a(", Nothing)
]
validateRegex [(".","aasd",Just "a")
,("a+","b",Nothing)
,("a*","s", Just "")
,("colou?r","color", Just "color")
,("colou?r","colour", Just "colour")
,("[abc]+","cbaba", Just "cbaba")
,("[a-z]+","kljkljij", Just "kljkljij")
,("[^abc]+","skjlj", Just "skjlj")
,(".*","asdlkjlkjls", Just "asdlkjlkjls")
,("(sa ){1,2}", "sa sa sa ", Just "sa sa ")
,("(sa ){2,10}", "sa sa sa ", Just "sa sa sa ")
,("(sa |ab,){2,10}", "sa ab,sa ", Just "sa ab,sa ")
,("(sa ){2,}", "sa sa sa ", Just "sa sa sa ")
,("w{1,3}[.]colo?[rsd][.](com|ne*t)", "www.color.com so", Just "www.color.com")
,("w{1,3}[.]colo?[rsd][.](com|ne*t)", "www.color.net so", Just "www.color.net")
,("w{1,3}[.]colo?[rsd][.](com|ne*t)", "www.color.neeeeet so", Just "www.color.neeeeet")
,("w{1,3}[.]colo?[rsd][.](com|ne*t)", "ww.colos.com", Just "ww.colos.com")
,("w{1,3}[.]colo?[rsd][.](com|ne*t)", "wwww.colos.com", Nothing)
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment