Last active
August 29, 2015 13:57
-
-
Save mniip/9703507 to your computer and use it in GitHub Desktop.
A regex-like pattern matcher operating on arbitrary data types.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Patterns | |
( | |
captures, match, residue, | |
maybeCaptures, maybeMatch, gsub, | |
capture, captureMap, captureGuard, captureEq, lookAhead, lookAheadNot, | |
attach, orElse, tryOne, greedySome, lazySome, greedyMany, lazyMany, | |
times, | |
always, never, end, anything, value, notValue, range, anyOf, noneOf, lambda, | |
list, balanced, | |
numeric, hexadecimal, lowercase, uppercase, alpha, alphanumeric, ascii, | |
whitespace | |
) where | |
-- | The 'Match' type encapsulates a possible expression match. Expressions | |
-- always return a list of matches, from which, the first is the usually | |
-- required one. | |
-- Match captures match residue | |
data Match a = Match [[a]] [a] [a] deriving Show | |
-- | 'cmatch' creates a Match with no captures | |
cmatch :: [a] -> [a] -> Match a | |
cmatch = Match [] | |
-- | 'mprepend' prepends a character to a list of matches | |
mprepend :: a -> [Match a] -> [Match a] | |
mprepend d = map (\(Match c m r) -> Match c (d:m) r) | |
-- | 'captures' returns a Match's captures | |
captures :: Match a -> [[a]] | |
captures (Match c _ _) = c | |
-- | 'match' returns part of the string consumed by the pattern | |
match :: Match a -> [a] | |
match (Match _ m _) = m | |
-- | 'residue' returns a Match's residue, that is, part of the string not | |
-- consumed by the pattern yet | |
residue :: Match a -> [a] | |
residue (Match _ _ r) = r | |
-- | 'maybeCaptures' matches a pattern against a string, and returns captures | |
-- created by the least backtracked Match, if any | |
maybeCaptures :: Pattern a -> [a] -> Maybe [[a]] | |
maybeCaptures f ds = case f ds of | |
[] -> Nothing | |
(r:_) -> Just $ captures r | |
-- | 'maybeMatch' matches a pattern against a string, ignoring any captures, and | |
-- returns part of the string consumed by the least backtracked Match, if any | |
maybeMatch :: Pattern a -> [a] -> Maybe [a] | |
maybeMatch f ds = case f ds of | |
[] -> Nothing | |
(r:_) -> Just $ match r | |
-- | 'gsub f l d' finds all non-overlapping (least-backtraced) occurences of 'f' | |
-- in 'd', pipes their captures through 'l', and then puts back into 'd' in | |
-- respective places | |
gsub :: Pattern a -> ([[a]] -> [a]) -> [a] -> [a] | |
gsub _ _ [] = [] | |
gsub f l dd@(d:ds) = case f dd of | |
[] -> d:(gsub f l ds) | |
(Match c _ r:_) -> (l c) ++ (gsub f l r) | |
-- | The 'Pattern' type encapsulates an expression. It is a function, which, | |
-- when being fed an input string, returns a list of possible ways to parse that | |
-- input string, with the first attempted parse being first and so on. | |
type Pattern a = [a] -> [Match a] | |
-- ----------------------------------------------------------------------------- | |
-- Pattern operators | |
-- | 'capture f' captures whatever 'f' has matched | |
capture :: Pattern a -> Pattern a | |
capture f ds = map (\(Match c h t) -> (Match (h:c) h t)) $ f ds | |
-- | 'captureMap l f' changed 'f's captures using 'l' | |
captureMap :: ([[a]] -> [[a]]) -> Pattern a -> Pattern a | |
captureMap l f ds = map (\(Match c h t) -> Match (l c) h t) $ f ds | |
-- | 'captureGuard l f' filters out those matches of 'f', for which 'l' returns | |
-- True | |
captureGuard :: ([[a]] -> Bool) -> Pattern a -> Pattern a | |
captureGuard l f ds = filter (\(Match c _ _) -> l c) $ f ds | |
-- | 'captureEq a b f' filters those matches of 'f', for which a'th capture | |
-- equals b'th capture. If one of the required captures is not present, the | |
-- filter returns false. If both aren't, it returns True. | |
captureEq :: (Eq a) => Int -> Int -> Pattern a -> Pattern a | |
captureEq a b = captureGuard (\c -> (maybeIndex a c) == (maybeIndex b c)) | |
where | |
maybeIndex :: Int -> [a] -> Maybe a | |
maybeIndex _ [] = Nothing | |
maybeIndex 0 (x:_) = Just x | |
maybeIndex n (x:c) = maybeIndex (n - 1) c | |
-- | 'lookAhead f' matches empty string if 'f' matches | |
lookAhead :: Pattern a -> Pattern a | |
lookAhead f ds = case f ds of | |
[] -> [] | |
_ -> [cmatch [] ds] | |
-- | 'lookAheadNot f' matches empty string if 'f' does not match | |
lookAheadNot :: Pattern a -> Pattern a | |
lookAheadNot f ds = case f ds of | |
[] -> [cmatch [] ds] | |
_ -> [] | |
-- | 'f `attach` g' matches 'f' followed by 'g' | |
infixr 6 `attach` | |
attach :: Pattern a -> Pattern a -> Pattern a | |
attach f g ds = [ Match (c1 ++ c2) (m1 ++ m2) r2 | | |
Match c1 m1 r1 <- f ds, | |
Match c2 m2 r2 <- g r1] | |
-- | 'f `orElse` g' matches 'f' and in case of failure matches 'g' | |
infixr 5 `orElse` | |
orElse :: Pattern a -> Pattern a -> Pattern a | |
orElse f g ds = (f ds) ++ (g ds) | |
-- | 'tryOne f' attempts to match 'f' once, and upon failure to do so, matches | |
-- empty string | |
tryOne :: Pattern a -> Pattern a | |
tryOne f ds = (f ds) ++ [cmatch [] ds] | |
-- | 'greedySome f' attempts to match 'f' as many times as possible, then one | |
-- time less, and so on, excluding the empty match. Backtracks matches that | |
-- matched 0 characters. If the input list is infinite and therefore given | |
-- pattern matches infinitely, neither of possible results will ever evaluate | |
greedySome :: Pattern a -> Pattern a | |
greedySome f ds = case f ds of | |
[] -> [] | |
r -> [ Match (c1 ++ c2) (m1 ++ m2) r2 | | |
Match c1 m1 r1 <- r, not $ null m1, | |
Match c2 m2 r2 <- greedySome f r1] ++ r | |
-- | 'lazySome f' attempts to match 'f' 1 time, then 2 times, until it is | |
-- impossible to match any more. Backtracks matches that matched 0 characters. | |
lazySome :: Pattern a -> Pattern a | |
lazySome f ds = let r = f ds in | |
r ++ [ Match (c1 ++ c2) (m1 ++ m2) r2 | | |
Match c1 m1 r1 <- r, not $ null m1, | |
Match c2 m2 r2 <- lazySome f r1] | |
-- | 'greedyMany f' attempts to match 'f' as many times as possible, then one | |
-- time less, and so on, up to and including the empty match. Backtracks | |
-- matches that matched 0 characters. If the input list is infinite and | |
-- therefore given pattern matches infinitely, neither of possible results | |
-- will ever evaluate | |
greedyMany :: Pattern a -> Pattern a | |
greedyMany f ds = case f ds of | |
[] -> [cmatch [] ds] | |
r -> [ Match (c1 ++ c2) (m1 ++ m2) r2 | | |
Match c1 m1 r1 <- r, not $ null m1, | |
Match c2 m2 r2 <- greedyMany f r1] ++ [cmatch [] ds] | |
-- | 'lazyMany f' attempts to match 'f' 0 times, then 1 time, until it is | |
-- impossible to match any more. Backtracks matches that matched 0 characters. | |
lazyMany :: Pattern a -> Pattern a | |
lazyMany f ds = case f ds of | |
[] -> [cmatch [] ds] | |
r -> (cmatch [] ds) : [ Match (c1 ++ c2) (m1 ++ m2) r2 | | |
Match c1 m1 r1 <- r, not $ null m1, | |
Match c2 m2 r2 <- lazyMany f r1] | |
-- | 'times n f' matches 'f', repeated exactly 'n' times | |
times :: Int -> Pattern a -> Pattern a | |
times 0 _ = always | |
times n f = foldr1 attach $ replicate n f | |
-- ----------------------------------------------------------------------------- | |
-- Patterns | |
-- | 'always' matches an empty string | |
always :: Pattern a | |
always ds = [cmatch [] ds] | |
-- | 'never' never matches | |
never :: Pattern a | |
never _ = [] | |
-- | 'end' matches the end of input | |
end :: Pattern a | |
end [] = [cmatch [] []] | |
end _ = [] | |
-- | 'anything' matches any value in input's head | |
anything :: Pattern a | |
anything [] = [] | |
anything (d:ds) = [cmatch [d] ds] | |
-- | 'value x' matches input's head if it is 'x' | |
value :: (Eq a) => a -> Pattern a | |
value _ [] = [] | |
value v (d:ds) | |
| v == d = [cmatch [d] ds] | |
| otherwise = [] | |
-- | 'value x' matches input's head if it is not 'x' | |
notValue :: (Eq a) => a -> Pattern a | |
notValue _ [] = [] | |
notValue v (d:ds) | |
| v /= d = [cmatch [d] ds] | |
| otherwise = [] | |
-- | 'range a b' matches input's head if it lies within [a; b] | |
range :: (Ord a) => a -> a -> Pattern a | |
range _ _ [] = [] | |
range a b (d:ds) | |
| d >= a && d <= b = [cmatch [d] ds] | |
| otherwise = [] | |
-- | 'anyOf l' matches input's head if it is an element of 'l' | |
anyOf :: (Eq a) => [a] -> Pattern a | |
anyOf _ [] = [] | |
anyOf l (d:ds) | |
| elem d l = [cmatch [d] ds] | |
| otherwise = [] | |
-- | 'noneOf l' matches input's head if it is not an element of 'l' | |
noneOf :: (Eq a) => [a] -> Pattern a | |
noneOf _ [] = [] | |
noneOf l (d:ds) | |
| notElem d l = [cmatch [d] ds] | |
| otherwise = [] | |
-- | 'lambda l' matches input's head, if 'l' returns True for it | |
lambda :: (a -> Bool) -> Pattern a | |
lambda _ [] = [] | |
lambda l (d:ds) | |
| l d = [cmatch [d] ds] | |
| otherwise = [] | |
-- | 'list m' matches m, if the input begins with it | |
list :: (Eq a) => [a] -> Pattern a | |
list [] d = [cmatch [] d] | |
list _ [] = [] | |
list (v:vs) (d:ds) | |
| v == d = mprepend d $ list vs ds | |
| otherwise = [] | |
balanced :: (Eq a) => a -> a -> Pattern a | |
balanced a b (d:ds) | |
| a == d = mprepend d $ balance 1 a b ds | |
| otherwise = [] | |
where | |
balance 0 _ _ ds = [cmatch [] ds] | |
balance _ _ _ [] = [] | |
balance l a b (d:ds) = mprepend d $ balance (offset d a b l) a b ds | |
offset d a b l | |
| d == a = l + 1 | |
| d == b = l - 1 | |
| otherwise = l | |
-- ----------------------------------------------------------------------------- | |
-- Useful String-wise patterns | |
numeric = range '0' '9' | |
hexadecimal = numeric `orElse` range 'a' 'f' `orElse` range 'A' 'F' | |
lowercase = range 'a' 'z' | |
uppercase = range 'A' 'Z' | |
alpha = lowercase `orElse` uppercase | |
alphanumeric = alpha `orElse` numeric `orElse` anyOf "_-" | |
ascii = range ' ' '~' | |
whitespace = anyOf "\t\n\v\f\r " |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment