@LightAndLight
Last active November 21, 2023 15:26
Lexing and parsing in Megaparsec
\begin{code}
{-# LANGUAGE TypeFamilies #-}
module Parser where
import Control.Applicative    ((*>), (<|>))
import Control.Monad          (void)
import Data.Functor           (($>))
import Data.List.NonEmpty     (NonEmpty(..))
import Data.Maybe             (fromJust)
import Data.Semigroup         ((<>))
import Text.Megaparsec        (Dec, Parsec, Pos, ShowToken(..),
                               SourcePos(..), alphaNumChar, between, char,
                               choice, eof, lookAhead, mkPos, skipMany, skipSome, some,
                               spaceChar, statePos, string, try)
import Text.Megaparsec.Prim   (MonadParsec)
import Text.Megaparsec.String (Parser)

import qualified Text.Megaparsec       as Error (ErrorItem(..))
import qualified Text.Megaparsec.Prim  as Prim  (Stream(..), getParserState, token)
import qualified Data.List.NonEmpty    as N
import qualified Data.Set              as S
import qualified Text.Megaparsec.Lexer as L
\end{code}
A `Token` has a source position and a type.
\begin{code}
data Token
  = Token
  { tokenPos  :: SourcePos
  , tokenType :: TokenType
  } deriving (Eq, Ord, Show)
\end{code}
`Token` types. End of file has its own token because we will need to express when
a valid program is allowed to end.
\begin{code}
data TokenType
  = Begin
  | End
  | Subsection
  | Semicolon
  | Identifier String
  | Eof
  deriving (Eq, Ord, Show)
\end{code}
Each type of token has a size: the length of the text it was parsed from.
This will be used later to generate nice error messages.
\begin{code}
tokenTypeSize :: TokenType -> Pos
tokenTypeSize t =
  fromJust . mkPos $ case t of
    Begin -> length "begin"
    End -> length "end"
    Subsection -> length "subsection"
    Semicolon -> length ";"
    Identifier str -> length str
    Eof -> 1
\end{code}
Parser to skip whitespace and comments:
\begin{code}
skipWhitespace :: Parser ()
skipWhitespace =
  L.space
    (void spaceChar)
    (L.skipLineComment "//")
    (L.skipBlockComment "/*" "*/")
\end{code}
Combinator to easily add the current source position to a `Token`:
\begin{code}
withPos
  :: MonadParsec e s m
  => (SourcePos -> m a)
  -> m a
withPos f = (N.head . statePos <$> Prim.getParserState) >>= f
\end{code}
Combinators that skip whitespace after tokens are parsed.
`delimiter` is a parser for characters that can follow a keyword.
`keyword` will only parse its argument if it's followed by a `delimiter`. This allows
`Identifier`s that contain a keyword (like `beginner`) to still be parsed correctly.
\begin{code}
delimiter :: Parser ()
delimiter = choice [void $ char ';', void spaceChar, eof]

lexeme :: Parser a -> Parser a
lexeme = L.lexeme skipWhitespace

keyword :: String -> Parser String
keyword p = lexeme (try $ string p <* lookAhead delimiter)

symbol :: String -> Parser String
symbol = L.symbol skipWhitespace
\end{code}
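A quick check of the keyword/identifier distinction, written as doctest-style comments
so the module still compiles unchanged (illustrative only; `parseMaybe` comes from
`Text.Megaparsec` and isn't imported above):
\begin{code}
-- >>> parseMaybe (keyword "begin") "begin"
-- Just "begin"
--
-- `lookAhead delimiter` fails on the 'n' that follows "begin", so `try`
-- backtracks instead of claiming the first five characters:
--
-- >>> parseMaybe (keyword "begin") "beginner"
-- Nothing
--
-- >>> parseMaybe (lexeme (some alphaNumChar)) "beginner"
-- Just "beginner"
\end{code}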
Parsers for each `Token` constructor:
\begin{code}
beginToken = withPos $ \pos -> Token pos <$>
  (keyword "begin" $> Begin)

endToken = withPos $ \pos -> Token pos <$>
  (keyword "end" $> End)

subsectionToken = withPos $ \pos -> Token pos <$>
  (keyword "subsection" $> Subsection)

semicolonToken = withPos $ \pos -> Token pos <$>
  (symbol ";" $> Semicolon)

identifierToken = withPos $ \pos -> Token pos .
  Identifier <$> lexeme (some alphaNumChar)

eofToken = withPos $ \pos -> Token pos <$>
  (eof $> Eof)
\end{code}
Parser for a single `Token`.
`Eof` is not in this list because it should only ever be parsed once. If we added it
to the list then the parser would loop forever, continually parsing `Eof` tokens.
\begin{code}
token :: Parser Token
token = choice
  [ beginToken
  , endToken
  , subsectionToken
  , semicolonToken
  , identifierToken
  ]
\end{code}
Parser for a sequence of `Token`s.
* Skip any whitespace at the beginning of the input
* Parse as many `token`s as possible
* Get the `Eof` token and its position
* Return the full sequence
\begin{code}
tokens :: Parser [Token]
tokens = do
  skipWhitespace
  toks <- some token
  end <- eofToken
  pure $ toks ++ [end]
\end{code}
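Running the lexer over a small input, again as doctest-style comments (a sketch; it
assumes `parse` from `Text.Megaparsec` is in scope, and only the token types are shown
because the `SourcePos` fields make the full tokens noisy):
\begin{code}
-- >>> map tokenType <$> parse tokens "<example>" "begin name alice end;"
-- Right [Begin,Identifier "name",Identifier "alice",End,Semicolon,Eof]
\end{code}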
That's the lexing phase done. Now some setup for parsing.
Pretty printing for `Token`s. This allows `parseErrorPretty` to generate nice errors.
\begin{code}
instance ShowToken Token where
  showTokens ts = foldMap (prettyToken . tokenType) $ N.toList ts
    where
      quoted s = "\'" ++ s ++ "\'"

      prettyToken :: TokenType -> String
      prettyToken t =
        case t of
          Begin -> quoted "begin"
          End -> quoted "end"
          Subsection -> quoted "subsection"
          Semicolon -> quoted ";"
          Identifier s -> "identifier " ++ quoted s
          Eof -> "end of file"
\end{code}
Setup that tells the parser how to consume a stream of our tokens.
`updatePos` is important because it is what lets parse errors be reported at useful source positions.
\begin{code}
newtype TokenStream = TokenStream { getTokenStream :: [Token] }

instance Prim.Stream TokenStream where
  type Token TokenStream = Token

  uncons (TokenStream toks) =
    case toks of
      [] -> Nothing
      (t:ts) -> Just (t, TokenStream ts)

  updatePos _ _ sourcePos tok =
    let
      pos = tokenPos tok
      size = tokenTypeSize (tokenType tok)
    in
    ( sourcePos
    , pos { sourceColumn = sourceColumn pos <> size }
    )
\end{code}
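As a worked example of that arithmetic: an `Identifier "alice"` token recorded at
line 1, column 12 has size 5, so after it is consumed the stream position becomes
line 1, column 17 (the `Semigroup` instance for `Pos` adds the two values). That is
just past the end of the word, which is where an error at the following token will
be reported.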
Primitives for parsing a stream of `Token`s.
`exact` asserts that the next token in the stream has the same `TokenType` as its
argument.
`identifier` asserts that the next token in the stream is an `Identifier`, and then
returns the identifier string if it succeeds.
\begin{code}
type ProgramParser = Parsec Dec TokenStream

exact :: TokenType -> ProgramParser ()
exact t = Prim.token isToken Nothing
  where
    isToken t'
      | t == tokenType t' = Right ()
      | otherwise = Left
        ( S.singleton . Error.Tokens $ pure t'
        , S.singleton . Error.Tokens $ pure t' { tokenType = t }
        , S.empty
        )

identifier :: ProgramParser String
identifier = Prim.token isIdent Nothing
  where
    isIdent t =
      case tokenType t of
        (Identifier str) -> Right str
        _ -> Left
          ( S.singleton . Error.Label $ N.fromList "Identifier"
          , S.empty
          , S.empty
          )
\end{code}
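A sketch of the error `exact` produces on a mismatch, as doctest-style comments
(it assumes `parse` from `Text.Megaparsec` is in scope; the error value is abbreviated):
\begin{code}
-- >>> let Right toks = parse tokens "<example>" "begin"
-- >>> parse (exact Semicolon) "<example>" (TokenStream toks)
-- Left ...
--
-- The "unexpected" set of that error holds the actual 'begin' token, and the
-- "expected" set holds the same token rewritten to have a Semicolon type, so a
-- pretty-printed error can name both at the right position.
\end{code}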
Program AST:
\begin{code}
data Program
  = Program [Block]
  deriving (Eq, Show)

data Block
  = Block [Block]
  | Field String String
  deriving (Eq, Show)
\end{code}
Program grammar:
\begin{code}
semicolon :: ProgramParser ()
semicolon = exact Semicolon

begin :: ProgramParser ()
begin = exact Begin

end :: ProgramParser ()
end = exact End *> semicolon

endSubsection :: ProgramParser ()
endSubsection =
  exact End *>
  exact Subsection *>
  semicolon

field :: ProgramParser Block
field = Field <$> identifier <*> identifier

block :: ProgramParser Block
block =
  Block <$> between begin endSubsection (some block) <|>
  field

program :: ProgramParser Program
program =
  (Program <$> between begin end (some block)) <*
  exact Eof
\end{code}
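Finally, the two phases glued together, again as doctest-style comments (a sketch
assuming `parse` from `Text.Megaparsec` is in scope and that lexing the example
input succeeds):
\begin{code}
-- >>> let input = "begin begin name alice end subsection; age forty end;"
-- >>> let Right toks = parse tokens "<example>" input
-- >>> parse program "<example>" (TokenStream toks)
-- Right (Program [Block [Field "name" "alice"],Field "age" "forty"])
\end{code}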
