Last active
May 19, 2020 17:44
-
-
Save gelisam/bca49348986f119457837cdbfa746cd4 to your computer and use it in GitHub Desktop.
prototype parser with an associative <|>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- A prototype demonstrating a possible fix for https://github.com/mrkkrp/megaparsec/issues/412 | |
{-# LANGUAGE LambdaCase, TemplateHaskell #-} | |
{-# OPTIONS -Wno-name-shadowing #-} | |
module Main where | |
import Test.DocTest | |
import Control.Lens | |
import Control.Monad.Trans.Class | |
import Control.Monad.Trans.Maybe | |
import Control.Monad.Reader | |
import Control.Monad.State | |
import Data.List | |
import Data.Map (Map) | |
import qualified Data.Map as Map | |
-- | | |
-- As https://github.com/mrkkrp/megaparsec/issues/412#issuecomment-626441968 | |
-- explains, the problem is that sometimes the error information about early | |
-- positions is dropped in favor of error information about later positions, | |
-- while at other times it is the error information about the later positions | |
-- which is dropped. There are thus two obvious ways to solve the problem: | |
-- | |
-- 1. Always prefer the information about the early position. | |
-- 2. Always prefer the information about the later position. | |
-- | |
-- Unfortunately, both approaches are flawed. Here's what we get with approach 1: | |
-- | |
-- > parseTest ((char 'a' <|> char 'b') >> char 'c') "bz" | |
-- expected 'a', found "bz" | |
-- | |
-- The error is very misleading, it's an old error which would have been | |
-- relevant if the parser had failed at position 0, but since the parser did | |
-- succeed at consuming character 'b', we would prefer to get an error about 'c' | |
-- instead. | |
-- | |
-- A slightly less naïve variant is to always prefer the information about the | |
-- character which would be consumed next. Here's what we get with that | |
-- approach: | |
-- | |
-- > parseTest ((char 'a' <|> char 'b') >> char 'c') "bz" | |
-- expected 'c', found "z" | |
-- > parseTest (try "world" (mapM_ char "world") <|> mapM_ char "universe") "wold" | |
-- expected world or 'u', found "wold" | |
-- | |
-- This is much better, but since it's pretty obvious that the world branch is | |
-- the successful one, we might prefer an error which points inside | |
-- that branch instead of an error which merely says that a correctly-parsing | |
-- world was expected. The more complex the parsing logic inside the branch, the | |
-- more important this becomes. | |
-- | |
-- Approach 2 solves that problem, but has problems of its own: | |
-- | |
-- > parseTest ((char 'a' <|> char 'b') >> char 'c') "bz" | |
-- expected 'c', found "z" | |
-- > parseTest (try "world" (mapM_ char "world") <|> mapM_ char "universe") "wold" | |
-- expected 'r', found "l" | |
-- > parseTest (try "world" (mapM_ char "hello world") <|> mapM_ char "hello universe") "hello wombat" | |
-- expected 'r', found "mbat" | |
-- | |
-- This time, we don't want the hello branch's error because the universe branch | |
-- successfully consumes some characters, which means the parser is quite | |
-- confident that this is the correct branch and thus error messages should be | |
-- about what happens in that branch, not the discarded hello branch. | |
-- | |
-- | |
-- In this prototype, I demonstrate a third approach which combines the less | |
-- naïve variant of approach 1 with approach 2. Here are the results it gives: | |
-- | |
-- >>> parseTest ((char 'a' <|> char 'b') >> char 'c') "bz" | |
-- expected 'c', found "z" | |
-- >>> parseTest (try "world" (mapM_ char "world") <|> mapM_ char "universe") "wold" | |
-- expected world or 'u', found "wold" | |
-- in the most promising alternative(s): | |
-- expected 'r', found "ld" | |
-- >>> parseTest (try "world" (mapM_ char "hello world") <|> mapM_ char "hello universe") "hello wombat" | |
-- expected 'u', found "wombat" | |
-- | |
-- For example 1, my prototype correctly mentions 'c', not 'a'. For example 3, | |
-- my prototype correctly gives errors about the universe branch, not the | |
-- discarded world branch. | |
-- | |
-- Example 2 is more complicated. Since neither branch consumed any characters, | |
-- it's not clear which branch the error message should be about, so I first | |
-- list all those branches. But just in case the branch which observed the most | |
-- characters is indeed the one the user had in mind, I also give the error | |
-- message from that branch. Or branches, if there is a tie: | |
-- | |
-- >>> parseTest (try "world" (mapM_ char "world") <|> try "wombat" (mapM_ char "wombat") <|> mapM_ char "universe") "wold" | |
-- expected world, wombat, or 'u', found "wold" | |
-- in the most promising alternative(s): | |
-- expected 'r' or 'm', found "ld" | |
-- My implementation strategy is simply to keep track of two error messages, | |
-- updating and clearing them when appropriate. | |
data ParseState = ParseState | |
{ _currentPosition :: Int | |
, _consuming :: Bool | |
, _consumedPosition :: Int | |
, _consumedHints :: [String] | |
, _observedPosition :: Int | |
, _observedHints :: [String] | |
} deriving Show | |
makeLenses ''ParseState | |
initialParseState :: ParseState | |
initialParseState = ParseState | |
{ _currentPosition = 0 | |
, _consuming = True | |
, _consumedPosition = 0 | |
, _consumedHints = [] | |
, _observedPosition = 0 | |
, _observedHints = [] | |
} | |
-- Later, I want to prove that (<|>) is associative. I thus use monad | |
-- transformers so I can rely on Monad's associativity law. | |
type ProtoParser = ReaderT (Int -> Maybe Char) | |
(State ParseState) | |
type Parser = MaybeT ProtoParser | |
runParser :: Parser a | |
-> String -> Either String a | |
runParser p input = finishUp | |
. flip runState initialParseState | |
. flip runReaderT charAt | |
. runMaybeT | |
$ p | |
where | |
charMap :: Map Int Char | |
charMap = Map.fromList $ zip [0..] input | |
charAt :: Int -> Maybe Char | |
charAt i = Map.lookup i charMap | |
expected :: [String] -> String | |
expected [] = "something else" | |
expected [s] = s | |
expected [s1, s2] = s1 ++ " or " ++ s2 | |
expected ss = intercalate ", " (init ss) ++ ", or " ++ last ss | |
found :: Int -> String | |
found i = case charAt i of | |
Nothing -> "<eof>" | |
Just _ -> toListOf (each . to (+ i) . to charAt . _Just) [0..10] | |
generateError :: Int -> [String] -> String | |
generateError i alts = "expected " ++ expected alts | |
++ ", found " ++ show (found i) | |
finishUp :: (Maybe a, ParseState) -> Either String a | |
finishUp (Just a, _) = pure a | |
finishUp (Nothing, s) = do | |
let msg1 = generateError (s ^. consumedPosition) (s ^. consumedHints) | |
let msg2 = generateError (s ^. observedPosition) (s ^. observedHints) | |
if msg1 == msg2 | |
then Left msg1 | |
else Left $ msg1 ++ "\n" | |
++ "in the most promising alternative(s):\n" | |
++ msg2 | |
parseTest :: Show a | |
=> Parser a -> String -> IO () | |
parseTest p input = do | |
case runParser (p <* eof) input of | |
Left e -> do | |
putStrLn e | |
Right a -> do | |
print a | |
currentChar :: ProtoParser (Maybe Char) | |
currentChar = do | |
f <- ask | |
i <- use currentPosition | |
pure $ f i | |
stepForward :: ProtoParser () | |
stepForward = do | |
i <- use currentPosition | |
o <- use observedPosition | |
let i' = i + 1 | |
currentPosition .= i' | |
isConsuming <- use consuming | |
when isConsuming $ do | |
consumedPosition .= i' | |
consumedHints .= [] | |
-- when we consume a character, we commit to this branch and thus forget | |
-- the error messages from all prior branches. | |
observedPosition .= i' | |
observedHints .= [] | |
when (i' > o) $ do | |
-- this branch has gone further than all the prior branches; clear their | |
-- error messages so that 'observedHints' only contains hints about this | |
-- branch. | |
observedPosition .= i' | |
observedHints .= [] | |
addHint :: String -> ProtoParser () | |
addHint hint = do | |
i <- use currentPosition | |
c <- use consumedPosition | |
o <- use observedPosition | |
when (i == c) $ do | |
consumedHints <>= [hint] | |
when (i == o) $ do | |
observedHints <>= [hint] | |
char :: Char -> Parser () | |
char expected = do | |
actual <- lift currentChar | |
if actual == Just expected | |
then do | |
lift stepForward | |
pure () | |
else do | |
lift $ addHint $ show expected | |
empty | |
eof :: Parser () | |
eof = do | |
actual <- lift currentChar | |
if actual == Nothing | |
then do | |
pure () | |
else do | |
lift $ addHint "<eof>" | |
empty | |
empty :: Parser a | |
empty = MaybeT $ do | |
pure Nothing | |
(<|>) :: Parser a -> Parser a -> Parser a | |
p1 <|> p2 = MaybeT $ do | |
i0 <- use currentPosition | |
runMaybeT p1 >>= \case | |
Just a -> do | |
pure $ Just a | |
Nothing -> do | |
i1 <- use currentPosition | |
-- only consider the second branch if the first branch did not consume | |
-- any characters. | |
if i1 == i0 | |
then runMaybeT p2 | |
else pure Nothing | |
-- for simplicity, instead of naming branches using (<?>), in this prototype | |
-- every 'try' has a name. | |
try :: String -> Parser a -> Parser a | |
try name p = MaybeT $ do | |
i <- use currentPosition | |
isConsuming <- use consuming | |
consuming .= False | |
r <- runMaybeT p | |
consuming .= isConsuming | |
case r of | |
Nothing -> do | |
currentPosition .= i | |
addHint name | |
pure Nothing | |
Just a -> do | |
pure $ Just a | |
-- let's now recreate the test-case from #412 | |
a :: Parser () | |
a = try "a" (char 'a' >> char 'z') | |
b :: Parser () | |
b = char 'b' | |
c :: Parser () | |
c = pure () | |
-- | | |
-- >>> parseTest (leftAssociative >> char 'd') "aaa" | |
-- expected a, 'b', or 'd', found "aaa" | |
-- in the most promising alternative(s): | |
-- expected 'z', found "aa" | |
leftAssociative :: Parser () | |
leftAssociative = (a <|> b) <|> c | |
-- | | |
-- >>> parseTest (rightAssociative >> char 'd') "aaa" | |
-- expected a, 'b', or 'd', found "aaa" | |
-- in the most promising alternative(s): | |
-- expected 'z', found "aa" | |
rightAssociative :: Parser () | |
rightAssociative = a <|> (b <|> c) | |
-- Hurray, they give the same result! But is this true in general, or only for | |
-- this example? It is true in general, as I now show in this final section, by | |
-- using equational reasoning to prove that (<|>) is associative. | |
-- | |
-- Instead of rewriting (p1 <|> (p2 <|> p3)) to ((p1 <|> p2) <|> p3), which | |
-- would require simplifying (p1 <|> (p2 <|> p3)) to a simplest representation | |
-- and then complexifying it back to ((p1 <|> p2) <|> p3), I will rewrite both | |
-- (p1 <|> (p2 <|> p3)) and ((p1 <|> p2) <|> p3) to an identical simplified | |
-- representation. | |
-- | |
-- | |
-- > p1 <|> (p2 <|> p3) | |
-- | |
-- by definition | |
-- | |
-- > MaybeT $ do | |
-- > i0 <- use currentPosition | |
-- > runMaybeT p1 >>= \case | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i1 <- use currentPosition | |
-- > if i1 == i0 | |
-- > then do -- p2 <|> p3 | |
-- > i1 <- use currentPosition | |
-- > runMaybeT p2 >>= \case | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i2 <- use currentPosition | |
-- > if i2 == i1 | |
-- > then runMaybeT p3 | |
-- > else pure Nothing | |
-- > else pure Nothing | |
-- | |
-- the position doesn't change between the two 'i1's | |
-- | |
-- > MaybeT $ do | |
-- > i0 <- use currentPosition | |
-- > runMaybeT p1 >>= \case | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i1 <- use currentPosition | |
-- > if i1 == i0 | |
-- > then do -- p2 <|> p3 | |
-- > runMaybeT p2 >>= \case | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i2 <- use currentPosition | |
-- > if i2 == i1 | |
-- > then runMaybeT p3 | |
-- > else pure Nothing | |
-- > else pure Nothing | |
-- | |
-- 'i0' and 'i1' are equal in the then branch | |
-- | |
-- > MaybeT $ do | |
-- > i0 <- use currentPosition | |
-- > runMaybeT p1 >>= \case | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i1 <- use currentPosition | |
-- > if i1 == i0 | |
-- > then do -- p2 <|> p3 | |
-- > runMaybeT p2 >>= \case | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i2 <- use currentPosition | |
-- > if i2 == i0 | |
-- > then runMaybeT p3 | |
-- > else pure Nothing | |
-- > else pure Nothing | |
-- | |
-- | |
-- > (p1 <|> p2) <|> p3 | |
-- | |
-- by definition | |
-- | |
-- > MaybeT $ do | |
-- > i0 <- use currentPosition | |
-- > r2 <- do -- p1 <|> p2 | |
-- > i0 <- use currentPosition | |
-- > runMaybeT p1 >>= \case | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i1 <- use currentPosition | |
-- > if i1 == i0 | |
-- > then runMaybeT p2 | |
-- > else pure Nothing | |
-- > case r2 | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i2 <- use currentPosition | |
-- > if i2 == i0 | |
-- > then runMaybeT p3 | |
-- > else pure Nothing | |
-- | |
-- flatten the nested do | |
-- | |
-- > MaybeT $ do | |
-- > i0 <- use currentPosition | |
-- > i0 <- use currentPosition | |
-- > r2 <- runMaybeT p1 >>= \case | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i1 <- use currentPosition | |
-- > if i1 == i0 | |
-- > then runMaybeT p2 | |
-- > else pure Nothing | |
-- > case r2 | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i2 <- use currentPosition | |
-- > if i2 == i0 | |
-- > then runMaybeT p3 | |
-- > else pure Nothing | |
-- | |
-- deduplicate the 'use' and inline the 'r2' continuation | |
-- | |
-- > MaybeT $ do | |
-- > i0 <- use currentPosition | |
-- > r2 <- runMaybeT p1 >>= \case | |
-- > Just a -> do | |
-- > let r2 = Just a | |
-- > case r2 | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i2 <- use currentPosition | |
-- > if i2 == i0 | |
-- > then runMaybeT p3 | |
-- > else pure Nothing | |
-- > Nothing -> do | |
-- > i1 <- use currentPosition | |
-- > if i1 == i0 | |
-- > then do | |
-- > r2 <- runMaybeT p2 | |
-- > case r2 | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i2 <- use currentPosition | |
-- > if i2 == i0 | |
-- > then runMaybeT p3 | |
-- > else pure Nothing | |
-- > else do | |
-- > let r2 = Nothing | |
-- > case r2 | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i2 <- use currentPosition | |
-- > if i2 == i0 | |
-- > then runMaybeT p3 | |
-- > else pure Nothing | |
-- | |
-- specialize case expressions | |
-- | |
-- > MaybeT $ do | |
-- > i0 <- use currentPosition | |
-- > runMaybeT p1 >>= \case | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i1 <- use currentPosition | |
-- > if i1 == i0 | |
-- > then do | |
-- > runMaybeT p2 >>= \case | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i2 <- use currentPosition | |
-- > if i2 == i0 | |
-- > then runMaybeT p3 | |
-- > else pure Nothing | |
-- > else do | |
-- > i2 <- use currentPosition | |
-- > if i2 == i0 | |
-- > then runMaybeT p3 | |
-- > else pure Nothing | |
-- | |
-- the position doesn't change in the else branch | |
-- | |
-- > MaybeT $ do | |
-- > i0 <- use currentPosition | |
-- > runMaybeT p1 >>= \case | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i1 <- use currentPosition | |
-- > if i1 == i0 | |
-- > then do | |
-- > runMaybeT p2 >>= \case | |
-- > Just a -> do | |
-- > pure $ Just a | |
-- > Nothing -> do | |
-- > i2 <- use currentPosition | |
-- > if i2 == i0 | |
-- > then runMaybeT p3 | |
-- > else pure Nothing | |
-- > else do | |
-- > pure Nothing | |
main :: IO () | |
main = do | |
doctest ["src/Main.hs"] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment