tel/ParserCombinators.hs

## ParserCombinators.hs
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}

module ParserCombinators where

{-

We'll build a set of parser combinators from scratch demonstrating how
they arise as a monad transformer stack. Actually, how they arise as a
choice between two different monad transformer stacks!

-}

--------------------------------------------------------------------------------

import Control.Applicative
import Control.Monad
import Control.Monad.State

{-

Parser combinators are computations which, behind the scenes, consume
a stream of inbound characters producing /parsed partial results/ or
failing due to failing to match the grammar.

We'll describe the most basic interface in a typeclass. A parser
combinator type must be

1. a 'Functor' where 'fmap' changes the partial result without any side effect,
2. a 'Monad' and an 'Applicative' where '(>>=)'/'(<*>)' indicates /sequencing/,
3. an 'Alternative' where '(<|>)' indicates (left-biased) choice, and finally
4. an instance of 'Parses' defined below which gives us the most basic parser

-}

class Parses p where
  satisfy :: (Char -> Bool) -> p Char

-- We'll use ConstraintKinds to describe this overall set of
-- constraints.

type IsParser f
  = ( Parses f
    , Functor f
    , Applicative f
    , Alternative f
    , MonadPlus f  -- this is just for completeness, MonadPlus and
                   -- Alternative are the same thing!
    , Monad f
    )

-- And now we already can begin to write parser combinators, although,
-- honestly, each of these types is *far* too restrictive.

char :: Parses p => Char -> p Char
char c = satisfy (== c)

parensM :: (Parses m, Monad m) => m b -> m b
parensM p = do
  char '('
  res <- p
  char ')'
  return res

many1M :: MonadPlus p => p a -> p [a]
many1M p = do
  a  <- p
  as <- manyM p
  return (a:as)

manyM :: MonadPlus p => p a -> p [a]
manyM p = many1M p `mplus` return []

option :: Alternative p => p a -> p (Maybe a)
option p = fmap Just p <|> pure Nothing

choice :: Alternative p => [p a] -> p a
choice = foldr (<|>) empty

-- The @XM@ names are used to emphasize that these instances are
-- needlessly monadic.

parens :: (Parses p, Alternative p) => p b -> p b
parens p = (\_ a _ -> a) <$> char '(' <*> p <*> char ')'

many1 :: Alternative p => p a -> p [a]
many1 p = liftA2 (:) p (many p <|> pure [])

-- We'll eventually design our parser implementations to satisfy all
-- of 'IsParser', so since all of these combinators demand only
-- subsets of the constraints in 'IsParser' we'll ultimately be able
-- to use them all.

sepBy :: Alternative f => f a -> f s -> f [a]
sepBy p s = liftA2 (:) p ((s *> sepBy1 p s) <|> pure []) <|> pure []

sepBy1 :: Alternative f => f a -> f s -> f [a]
sepBy1 p s = scan where scan = liftA2 (:) p ((s *> scan) <|> pure [])

-- Also note that these are exactly the definitions used in real
-- parser combinator libraries like Attoparsec.

-- Because our parser combinator basis does not actually demand a
-- specific implementation of the parser type, we're also free to, for
-- instance, parse context sensitive grammars by stacking on
-- constraints like 'MonadState'

type IsContextSensitive s p = ( IsParser p, MonadState s p )

-- So let's implement this now.

--------------------------------------------------------------------------------

{-

Above we implemented a basis from which we can operate a
parser. Anything which satisfies the entire 'IsParser' interface
probably isn't lying. Let's examine two of these and see how they
arise as monad transformers.

The most basic parser is one which intakes an imput string and returns
'Maybe' the parsed interpretation of that string. That would nearly be
a perfect specification, except it doesn't talk about how to handle
"leftovers" if you only need to use part of the input to determine
whether or not this is a valid parse.

Instead of throwing leftovers away, we'll keep them around.

-}

runParser1 :: Parser1 a -> (String -> Maybe (a, String))
runParser1 (Parser1 go) inp = go inp

newtype Parser1 a = Parser1 (String -> Maybe (a, String))

{-

We can see that Parser1 implements the entire interface we need. I'll
just write that out, but we'll exmaine how it works in a little
bit. For now, just skip the following code and take it as material
proof that Parser1 is a sufficient design.

-}

instance Functor Parser1 where
  fmap f (Parser1 go) = Parser1 $ \inp -> do
    -- using the Maybe monad here
    (a, outp) <- go inp
    return (f a, outp)

instance Applicative Parser1 where
  pure = return
  p1 <*> p2 = do
    f <- p1
    x <- p2
    return (f x)

instance Alternative Parser1 where
  empty = Parser1 (\_ -> Nothing)
  p1 <|> p2 = Parser1 $ \inp ->
    case runParser1 p1 inp of
      Nothing -> runParser1 p2 inp
      Just x  -> Just x

instance Monad Parser1 where
  -- passes the input string straight through to the output
  return a = Parser1 (\inp -> return (a, inp))

  p >>= f = Parser1 $ \inp -> do
    (a, outp1) <- runParser1 p     inp
    (b, outp2) <- runParser1 (f a) outp1
    return (b, outp2)

instance Parses Parser1 where
  satisfy pred = Parser1 $ \inp ->
    case inp of
      []                 -> Nothing
      c : cs | pred c    -> Just (c, cs)
             | otherwise -> Nothing

{-

Phew!

So why does all of the code above work? Why should we believe that we
could ever implement this stuff?

Because if we look carefully at 'Parser1' we see that it's \"just\"
the same as 'Parser2'

-}

newtype Parser2 a =
  Parser2 (StateT String Maybe a)
  deriving ( Functor, Monad, Applicative, Alternative )

runParser2 :: Parser2 a -> String -> Maybe (a, String)
runParser2 (Parser2 go) inp = runStateT go inp

instance Parses Parser2 where
  satisfy pred = Parser2 $ do
    inp <- get
    case inp of
      []                 -> fail "empty input"
      c : cs | pred c    -> put cs >> return c
             | otherwise -> fail "satisfy"

{-

In other words, if we recognize that 'Parser1' is a monad transformer
stack of 'State' and 'Maybe' then all of the interfaces \"write
themselves\". In fact, if you go above and examine the interfaces
against the code for the implementation of 'State' then you'll see a
lot of similarities.

Which leads to the question: why are parsers equal to a combination of
'State' and 'Maybe' (failure)? Well, that seems to be a reasonable
description of what parsers do---we simply must restrict our notion of
state as being /causal/ in that we try to handle the present state
along a stream and throw it away when we're done: this is not at all
unlike @MonadState (Stream a)@.

This might make one wonder if we could change out the
components. And we can!

For instance, a /backtracking/ monadic parser combinator
implementation which satisfies all of the prior interface is just the
transformer stack of 'State' and '[]'.

-}

newtype Parser3 a =
  Parser3 { runParser3 :: StateT String [] a }
  deriving ( Functor, Monad, Applicative, Alternative )

-- Note that this is literally a copy-and-paste job from
-- above... That's because this implementation works for /any/
-- underlying monadic layer which has a notion of failure.
instance Parses Parser3 where
  satisfy pred = Parser3 $ do
    inp <- get
    case inp of
      []                 -> fail "empty input"
      c : cs | pred c    -> put cs >> return c
             | otherwise -> fail "satisfy"

{-

Here, we see that the @runX@ function denotes the idea that this
parser succeeds 0-or-more times just like we'd expect from a
non-deterministic, backtracking parse.

The list monad embodies depth-first search by default. What if we want
something breadth-first? Or if we want to ensure fairness in context
of an infinite space of potential parses?

We could rip out the '[]' layer and replace it with a monad from the
@logict@ or @omega@ packages, but instead let's just take advantage of
the idea that all we need to actually rely on is the /failure effect/:

-}

newtype Parser4 m a =
  Parser4 { runParser4 :: StateT String m a }
  deriving ( Functor, Monad, Applicative, Alternative )

instance FailWith m => Parses (Parser4 m) where
  satisfy pred = Parser4 $ do
    inp <- get
    case inp of
      []                 -> failWith "empty input"
      c : cs | pred c    -> put cs >> return c
             | otherwise -> failWith "satisfy"

-- FailWith just let's us be a bit more specific about why we're
-- failing. The 'fail' implementation built into 'Monad' is widely
-- considered to be a mistake since it often has really bad default
-- implementations---not all monads actually have a notion of failure!

class Monad m => FailWith m where
  failWith :: String -> m a

instance FailWith Maybe where
  failWith _ = Nothing

instance FailWith [] where
  failWith _ = []

instance FailWith m => FailWith (StateT s m) where
  failWith reason = lift (failWith reason)
	{-# LANGUAGE ConstraintKinds #-}
	{-# LANGUAGE GeneralizedNewtypeDeriving #-}

	module ParserCombinators where

	{-

	We'll build a set of parser combinators from scratch demonstrating how
	they arise as a monad transformer stack. Actually, how they arise as a
	choice between two different monad transformer stacks!

	-}

	--------------------------------------------------------------------------------

	import Control.Applicative
	import Control.Monad
	import Control.Monad.State

	{-

	Parser combinators are computations which, behind the scenes, consume
	a stream of inbound characters producing /parsed partial results/ or
	failing due to failing to match the grammar.

	We'll describe the most basic interface in a typeclass. A parser
	combinator type must be

	1. a 'Functor' where 'fmap' changes the partial result without any side effect,
	2. a 'Monad' and an 'Applicative' where '(>>=)'/'(<*>)' indicates /sequencing/,
	3. an 'Alternative' where '(<\|>)' indicates (left-biased) choice, and finally
	4. an instance of 'Parses' defined below which gives us the most basic parser

	-}

	class Parses p where
	satisfy :: (Char -> Bool) -> p Char

	-- We'll use ConstraintKinds to describe this overall set of
	-- constraints.

	type IsParser f
	= ( Parses f
	, Functor f
	, Applicative f
	, Alternative f
	, MonadPlus f -- this is just for completeness, MonadPlus and
	-- Alternative are the same thing!
	, Monad f
	)

	-- And now we already can begin to write parser combinators, although,
	-- honestly, each of these types is far too restrictive.

	char :: Parses p => Char -> p Char
	char c = satisfy (== c)

	parensM :: (Parses m, Monad m) => m b -> m b
	parensM p = do
	char '('
	res <- p
	char ')'
	return res

	many1M :: MonadPlus p => p a -> p [a]
	many1M p = do
	a <- p
	as <- manyM p
	return (a:as)

	manyM :: MonadPlus p => p a -> p [a]
	manyM p = many1M p `mplus` return []

	option :: Alternative p => p a -> p (Maybe a)
	option p = fmap Just p <\|> pure Nothing

	choice :: Alternative p => [p a] -> p a
	choice = foldr (<\|>) empty

	-- The @XM@ names are used to emphasize that these instances are
	-- needlessly monadic.

	parens :: (Parses p, Alternative p) => p b -> p b
	parens p = (\_ a _ -> a) <$> char '(' <> p <> char ')'

	many1 :: Alternative p => p a -> p [a]
	many1 p = liftA2 (:) p (many p <\|> pure [])

	-- We'll eventually design our parser implementations to satisfy all
	-- of 'IsParser', so since all of these combinators demand only
	-- subsets of the constraints in 'IsParser' we'll ultimately be able
	-- to use them all.

	sepBy :: Alternative f => f a -> f s -> f [a]
	sepBy p s = liftA2 (:) p ((s *> sepBy1 p s) <\|> pure []) <\|> pure []

	sepBy1 :: Alternative f => f a -> f s -> f [a]
	sepBy1 p s = scan where scan = liftA2 (:) p ((s *> scan) <\|> pure [])

	-- Also note that these are exactly the definitions used in real
	-- parser combinator libraries like Attoparsec.

	-- Because our parser combinator basis does not actually demand a
	-- specific implementation of the parser type, we're also free to, for
	-- instance, parse context sensitive grammars by stacking on
	-- constraints like 'MonadState'

	type IsContextSensitive s p = ( IsParser p, MonadState s p )

	-- So let's implement this now.

	--------------------------------------------------------------------------------

	{-

	Above we implemented a basis from which we can operate a
	parser. Anything which satisfies the entire 'IsParser' interface
	probably isn't lying. Let's examine two of these and see how they
	arise as monad transformers.

	The most basic parser is one which intakes an imput string and returns
	'Maybe' the parsed interpretation of that string. That would nearly be
	a perfect specification, except it doesn't talk about how to handle
	"leftovers" if you only need to use part of the input to determine
	whether or not this is a valid parse.

	Instead of throwing leftovers away, we'll keep them around.

	-}

	runParser1 :: Parser1 a -> (String -> Maybe (a, String))
	runParser1 (Parser1 go) inp = go inp

	newtype Parser1 a = Parser1 (String -> Maybe (a, String))

	{-

	We can see that Parser1 implements the entire interface we need. I'll
	just write that out, but we'll exmaine how it works in a little
	bit. For now, just skip the following code and take it as material
	proof that Parser1 is a sufficient design.

	-}

	instance Functor Parser1 where
	fmap f (Parser1 go) = Parser1 $ \inp -> do
	-- using the Maybe monad here
	(a, outp) <- go inp
	return (f a, outp)

	instance Applicative Parser1 where
	pure = return
	p1 <*> p2 = do
	f <- p1
	x <- p2
	return (f x)

	instance Alternative Parser1 where
	empty = Parser1 (\_ -> Nothing)
	p1 <\|> p2 = Parser1 $ \inp ->
	case runParser1 p1 inp of
	Nothing -> runParser1 p2 inp
	Just x -> Just x

	instance Monad Parser1 where
	-- passes the input string straight through to the output
	return a = Parser1 (\inp -> return (a, inp))

	p >>= f = Parser1 $ \inp -> do
	(a, outp1) <- runParser1 p inp
	(b, outp2) <- runParser1 (f a) outp1
	return (b, outp2)

	instance Parses Parser1 where
	satisfy pred = Parser1 $ \inp ->
	case inp of
	[] -> Nothing
	c : cs \| pred c -> Just (c, cs)
	\| otherwise -> Nothing

	{-

	Phew!

	So why does all of the code above work? Why should we believe that we
	could ever implement this stuff?

	Because if we look carefully at 'Parser1' we see that it's \"just\"
	the same as 'Parser2'

	-}

	newtype Parser2 a =
	Parser2 (StateT String Maybe a)
	deriving ( Functor, Monad, Applicative, Alternative )

	runParser2 :: Parser2 a -> String -> Maybe (a, String)
	runParser2 (Parser2 go) inp = runStateT go inp

	instance Parses Parser2 where
	satisfy pred = Parser2 $ do
	inp <- get
	case inp of
	[] -> fail "empty input"
	c : cs \| pred c -> put cs >> return c
	\| otherwise -> fail "satisfy"

	{-

	In other words, if we recognize that 'Parser1' is a monad transformer
	stack of 'State' and 'Maybe' then all of the interfaces \"write
	themselves\". In fact, if you go above and examine the interfaces
	against the code for the implementation of 'State' then you'll see a
	lot of similarities.

	Which leads to the question: why are parsers equal to a combination of
	'State' and 'Maybe' (failure)? Well, that seems to be a reasonable
	description of what parsers do---we simply must restrict our notion of
	state as being /causal/ in that we try to handle the present state
	along a stream and throw it away when we're done: this is not at all
	unlike @MonadState (Stream a)@.

	This might make one wonder if we could change out the
	components. And we can!

	For instance, a /backtracking/ monadic parser combinator
	implementation which satisfies all of the prior interface is just the
	transformer stack of 'State' and '[]'.

	-}

	newtype Parser3 a =
	Parser3 { runParser3 :: StateT String [] a }
	deriving ( Functor, Monad, Applicative, Alternative )

	-- Note that this is literally a copy-and-paste job from
	-- above... That's because this implementation works for /any/
	-- underlying monadic layer which has a notion of failure.
	instance Parses Parser3 where
	satisfy pred = Parser3 $ do
	inp <- get
	case inp of
	[] -> fail "empty input"
	c : cs \| pred c -> put cs >> return c
	\| otherwise -> fail "satisfy"

	{-

	Here, we see that the @runX@ function denotes the idea that this
	parser succeeds 0-or-more times just like we'd expect from a
	non-deterministic, backtracking parse.

	The list monad embodies depth-first search by default. What if we want
	something breadth-first? Or if we want to ensure fairness in context
	of an infinite space of potential parses?

	We could rip out the '[]' layer and replace it with a monad from the
	@logict@ or @omega@ packages, but instead let's just take advantage of
	the idea that all we need to actually rely on is the /failure effect/:

	-}

	newtype Parser4 m a =
	Parser4 { runParser4 :: StateT String m a }
	deriving ( Functor, Monad, Applicative, Alternative )

	instance FailWith m => Parses (Parser4 m) where
	satisfy pred = Parser4 $ do
	inp <- get
	case inp of
	[] -> failWith "empty input"
	c : cs \| pred c -> put cs >> return c
	\| otherwise -> failWith "satisfy"

	-- FailWith just let's us be a bit more specific about why we're
	-- failing. The 'fail' implementation built into 'Monad' is widely
	-- considered to be a mistake since it often has really bad default
	-- implementations---not all monads actually have a notion of failure!

	class Monad m => FailWith m where
	failWith :: String -> m a

	instance FailWith Maybe where
	failWith _ = Nothing

	instance FailWith [] where
	failWith _ = []

	instance FailWith m => FailWith (StateT s m) where
	failWith reason = lift (failWith reason)