@LightAndLight
Last active November 21, 2023 15:26
Lexing and parsing in Megaparsec
\begin{code}
{-# LANGUAGE TypeFamilies #-}
module Parser where
import Control.Applicative    ((*>), (<|>))
import Control.Monad          (void)
import Data.Functor           (($>))
import Data.List.NonEmpty     (NonEmpty(..))
import Data.Maybe             (fromJust)
import Data.Semigroup         ((<>))
import Text.Megaparsec        (Dec, Parsec, Pos, ShowToken(..),
                               SourcePos(..), alphaNumChar, between, char,
                               choice, eof, lookAhead, mkPos, skipMany, skipSome, some,
                               spaceChar, statePos, string, try)
import Text.Megaparsec.Prim   (MonadParsec)
import Text.Megaparsec.String (Parser)

import qualified Text.Megaparsec       as Error (ErrorItem(..))
import qualified Text.Megaparsec.Prim  as Prim  (Stream(..), getParserState, token)
import qualified Data.List.NonEmpty    as N
import qualified Data.Set              as S
import qualified Text.Megaparsec.Lexer as L
\end{code}
A `Token` has a source position and a type.
\begin{code}
data Token
  = Token
  { tokenPos  :: SourcePos
  , tokenType :: TokenType
  } deriving (Eq, Ord, Show)
\end{code}
`Token` types. End of file has its own token because we will need to express when
a valid program is allowed to end.
\begin{code}
data TokenType
  = Begin
  | End
  | Subsection
  | Semicolon
  | Identifier String
  | Eof
  deriving (Eq, Ord, Show)
\end{code}
Each type of token has a size: the length of the text it was parsed from.
This will be used later to generate nice error messages.
\begin{code}
tokenTypeSize :: TokenType -> Pos
tokenTypeSize t =
  fromJust . mkPos $ case t of
    Begin -> length "begin"
    End -> length "end"
    Subsection -> length "subsection"
    Semicolon -> length ";"
    Identifier str -> length str
    Eof -> 1
\end{code}
Parser to skip whitespace and comments:
\begin{code}
skipWhitespace :: Parser ()
skipWhitespace =
  L.space
    (void spaceChar)
    (L.skipLineComment "//")
    (L.skipBlockComment "/*" "*/")
\end{code}
Combinator to easily add the current source position to a `Token`:
\begin{code}
withPos
  :: MonadParsec e s m
  => (SourcePos -> m a)
  -> m a
withPos f = (N.head . statePos <$> Prim.getParserState) >>= f
\end{code}
Combinators that skip whitespace after tokens are parsed.
`delimiter` is a parser for characters that can follow a keyword.
`keyword` will only parse its argument if it's followed by a `delimiter`. This allows
`Identifier`s that contain a keyword (like `beginner`) to still be parsed correctly.
\begin{code}
delimiter :: Parser ()
delimiter = choice [void $ char ';', void spaceChar, eof]

lexeme :: Parser a -> Parser a
lexeme = L.lexeme skipWhitespace

keyword :: String -> Parser String
keyword p = lexeme (try $ string p <* lookAhead delimiter)

symbol :: String -> Parser String
symbol = L.symbol skipWhitespace
\end{code}
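A quick check of the keyword/identifier distinction, written as doctest-style comments
so the module still compiles unchanged (illustrative only; `parseMaybe` comes from
`Text.Megaparsec` and isn't imported above):
\begin{code}
-- >>> parseMaybe (keyword "begin") "begin"
-- Just "begin"
--
-- `lookAhead delimiter` fails on the 'n' that follows "begin", so `try`
-- backtracks instead of claiming the first five characters:
--
-- >>> parseMaybe (keyword "begin") "beginner"
-- Nothing
--
-- >>> parseMaybe (lexeme (some alphaNumChar)) "beginner"
-- Just "beginner"
\end{code}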
Parsers for each `Token` constructor:
\begin{code}
beginToken = withPos $ \pos -> Token pos <$>
  (keyword "begin" $> Begin)

endToken = withPos $ \pos -> Token pos <$>
  (keyword "end" $> End)

subsectionToken = withPos $ \pos -> Token pos <$>
  (keyword "subsection" $> Subsection)

semicolonToken = withPos $ \pos -> Token pos <$>
  (symbol ";" $> Semicolon)

identifierToken = withPos $ \pos -> Token pos .
  Identifier <$> lexeme (some alphaNumChar)

eofToken = withPos $ \pos -> Token pos <$>
  (eof $> Eof)
\end{code}
Parser for a single `Token`.
`Eof` is not in this list because it should only ever be parsed once. If we added it
to the list then the parser would loop forever, continually parsing `Eof` tokens.
\begin{code}
token :: Parser Token
token = choice
  [ beginToken
  , endToken
  , subsectionToken
  , semicolonToken
  , identifierToken
  ]
\end{code}
Parser for a sequence of `Token`s.
* Skip any whitespace at the beginning of the input
* Parse as many `token`s as possible
* Get the `Eof` token and its position
* Return the full sequence
\begin{code}
tokens :: Parser [Token]
tokens = do
  skipWhitespace
  toks <- some token
  end <- eofToken
  pure $ toks ++ [end]
\end{code}
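Running the lexer over a small input, again as doctest-style comments (a sketch; it
assumes `parse` from `Text.Megaparsec` is in scope, and only the token types are shown
because the `SourcePos` fields make the full tokens noisy):
\begin{code}
-- >>> map tokenType <$> parse tokens "<example>" "begin name alice end;"
-- Right [Begin,Identifier "name",Identifier "alice",End,Semicolon,Eof]
\end{code}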
That's the lexing phase done. Now some setup for parsing.
Pretty printing for `Token`s. This allows `parseErrorPretty` to generate nice errors.
\begin{code}
instance ShowToken Token where
  showTokens ts = foldMap (prettyToken . tokenType) $ N.toList ts
    where
      quoted s = "\'" ++ s ++ "\'"

      prettyToken :: TokenType -> String
      prettyToken t =
        case t of
          Begin -> quoted "begin"
          End -> quoted "end"
          Subsection -> quoted "subsection"
          Semicolon -> quoted ";"
          Identifier s -> "identifier " ++ quoted s
          Eof -> "end of file"
\end{code}
Setup that tells the parser how to consume a stream of our tokens.
`updatePos` is important because it is what lets parse errors be reported at useful source positions.
\begin{code}
newtype TokenStream = TokenStream { getTokenStream :: [Token] }

instance Prim.Stream TokenStream where
  type Token TokenStream = Token

  uncons (TokenStream toks) =
    case toks of
      [] -> Nothing
      (t:ts) -> Just (t, TokenStream ts)

  updatePos _ _ sourcePos tok =
    let
      pos = tokenPos tok
      size = tokenTypeSize (tokenType tok)
    in
    ( sourcePos
    , pos { sourceColumn = sourceColumn pos <> size }
    )
\end{code}
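As a worked example of that arithmetic: an `Identifier "alice"` token recorded at
line 1, column 12 has size 5, so after it is consumed the stream position becomes
line 1, column 17 (the `Semigroup` instance for `Pos` adds the two values). That is
just past the end of the word, which is where an error at the following token will
be reported.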
Primitives for parsing a stream of `Token`s.
`exact` asserts that the next token in the stream has the same `TokenType` as its
argument.
`identifier` asserts that the next token in the stream is an `Identifier`, and then
returns the identifier string if it succeeds.
\begin{code}
type ProgramParser = Parsec Dec TokenStream

exact :: TokenType -> ProgramParser ()
exact t = Prim.token isToken Nothing
  where
    isToken t'
      | t == tokenType t' = Right ()
      | otherwise = Left
        ( S.singleton . Error.Tokens $ pure t'
        , S.singleton . Error.Tokens $ pure t' { tokenType = t }
        , S.empty
        )

identifier :: ProgramParser String
identifier = Prim.token isIdent Nothing
  where
    isIdent t =
      case tokenType t of
        (Identifier str) -> Right str
        _ -> Left
          ( S.singleton . Error.Label $ N.fromList "Identifier"
          , S.empty
          , S.empty
          )
\end{code}
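A sketch of the error `exact` produces on a mismatch, as doctest-style comments
(it assumes `parse` from `Text.Megaparsec` is in scope; the error value is abbreviated):
\begin{code}
-- >>> let Right toks = parse tokens "<example>" "begin"
-- >>> parse (exact Semicolon) "<example>" (TokenStream toks)
-- Left ...
--
-- The "unexpected" set of that error holds the actual 'begin' token, and the
-- "expected" set holds the same token rewritten to have a Semicolon type, so a
-- pretty-printed error can name both at the right position.
\end{code}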
Program AST:
\begin{code}
data Program
  = Program [Block]
  deriving (Eq, Show)

data Block
  = Block [Block]
  | Field String String
  deriving (Eq, Show)
\end{code}
Program grammar:
\begin{code}
semicolon :: ProgramParser ()
semicolon = exact Semicolon

begin :: ProgramParser ()
begin = exact Begin

end :: ProgramParser ()
end = exact End *> semicolon

endSubsection :: ProgramParser ()
endSubsection =
  exact End *>
  exact Subsection *>
  semicolon

field :: ProgramParser Block
field = Field <$> identifier <*> identifier

block :: ProgramParser Block
block =
  Block <$> between begin endSubsection (some block) <|>
  field

program :: ProgramParser Program
program =
  (Program <$> between begin end (some block)) <*
  exact Eof
\end{code}
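Finally, the two phases glued together, again as doctest-style comments (a sketch
assuming `parse` from `Text.Megaparsec` is in scope and that lexing the example
input succeeds):
\begin{code}
-- >>> let input = "begin begin name alice end subsection; age forty end;"
-- >>> let Right toks = parse tokens "<example>" input
-- >>> parse program "<example>" (TokenStream toks)
-- Right (Program [Block [Field "name" "alice"],Field "age" "forty"])
\end{code}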
