{-# LANGUAGE TypeFamilies #-}
module Parser where
import Control.Applicative ((*>), (<|>))
import Control.Monad (void)
import Data.Functor (($>))
import Data.List.NonEmpty (NonEmpty(..))
import Data.Maybe (fromJust)
import Data.Semigroup ((<>))
import Text.Megaparsec (Dec, Parsec, Pos, ShowToken(..),
SourcePos(..), alphaNumChar, between, char,
choice, eof, lookAhead, mkPos, skipMany, skipSome, some,
spaceChar, statePos, string, try)
import Text.Megaparsec.Prim (MonadParsec)
import Text.Megaparsec.String (Parser)
import qualified Text.Megaparsec as Error (ErrorItem(..))
import qualified Text.Megaparsec.Prim as Prim (Stream(..), getParserState, token)
import qualified Data.List.NonEmpty as N
import qualified Data.Set as S
import qualified Text.Megaparsec.Lexer as L
A Token
has a source position and a type.
data Token
= Token
{ tokenPos :: SourcePos
, tokenType :: TokenType
} deriving (Eq, Ord, Show)
Token
types. End of file has its own token because we will need to express when a valid program is allowed to end.
data TokenType
= Begin
| End
| Subsection
| Semicolon
| Identifier String
| Eof
deriving (Eq, Ord, Show)
Each type of token has a size, which represents the size of the word it was parsed from. This will be used later to generate nice error messages.
tokenTypeSize :: TokenType -> Pos
tokenTypeSize t =
fromJust . mkPos $ case t of
Begin -> length "begin"
End -> length "end"
Subsection -> length "subsection"
Semicolon -> length ";"
Identifier str -> length str
Eof -> 1
Parser to skip whitespace and comments:
skipWhitespace :: Parser ()
skipWhitespace =
L.space
(void spaceChar)
(L.skipLineComment "//")
(L.skipBlockComment "/*" "*/")
Combinator to easily add the current source position to a Token
:
withPos
:: MonadParsec e s m
=> (SourcePos -> m a)
-> m a
withPos f = (N.head . statePos <$> Prim.getParserState) >>= f
Combinators that skip whitespace after tokens are parsed.
delimiter
is a parser for characters that can follow a keyword.
keyword
will only parse its argument if it's followed by a delimiter
. This allows Identifier
s to contain keywords, but still be parsed correctly.
delimiter :: Parser ()
delimiter = choice [void $ char ';', void spaceChar, eof]
lexeme :: Parser a -> Parser a
lexeme = L.lexeme skipWhitespace
keyword :: String -> Parser String
keyword p = lexeme (try $ string p <* lookAhead delimiter)
symbol :: String -> Parser String
symbol = L.symbol skipWhitespace
Parsers for each Token
constructor:
beginToken = withPos $ \pos -> Token pos <$>
(keyword "begin" $> Begin)
endToken = withPos $ \pos -> Token pos <$>
(keyword "end" $> End)
subsectionToken = withPos $ \pos -> Token pos <$>
(keyword "subsection" $> Subsection)
semicolonToken = withPos $ \pos -> Token pos <$>
(symbol ";" $> Semicolon)
identifierToken = withPos $ \pos -> Token pos .
Identifier <$> lexeme (some alphaNumChar)
eofToken = withPos $ \pos -> Token pos <$>
(eof $> Eof)
Parser for a single Token
.
Eof
is not in this list because it should only ever be parsed once. If we added it to the list then the parser would loop forever, continually parsing Eof
tokens.
token :: Parser Token
token = choice
[ beginToken
, endToken
, subsectionToken
, semicolonToken
, identifierToken
]
Parser for a sequence of Token
s.
- Skip any whitespace at the beginning of the input
- Parse as
token
s as possible - Get the
Eof
token and its position - Return the full sequence
tokens :: Parser [Token]
tokens = do
skipWhitespace
toks <- some token
end <- eofToken
pure $ toks ++ [end]
That's the lexing phase done. Now some setup for parsing.
Pretty printing for Token
s. This allows parseErrorPretty
to generate nice errors.
instance ShowToken Token where
showTokens ts = foldMap (prettyToken . tokenType) $ N.toList ts
where
quoted s = "\'" ++ s ++ "\'"
prettyToken :: TokenType -> String
prettyToken t =
case t of
Begin -> quoted "begin"
End -> quoted "end"
Subsection -> quoted "subsection"
Semicolon -> quoted ";"
Identifier s -> "identifier " ++ quoted s
Eof -> "end of file"
Setup that tells the parser how to consume a stream of our tokens. updatePos
is important because it allows for helpful reporting of error messages.
newtype TokenStream = TokenStream { getTokenStream :: [Token] }
instance Prim.Stream TokenStream where
type Token TokenStream = Token
uncons (TokenStream toks) =
case toks of
[] -> Nothing
(t:ts) -> Just (t, TokenStream ts)
updatePos _ _ sourcePos tok =
let
pos = tokenPos tok
size = tokenTypeSize (tokenType tok)
in
( sourcePos
, pos { sourceColumn = sourceColumn pos <> size }
)
Primitives for parsing a stream of Token
s.
exact
asserts that the next token in the stream has the same TokenType
as its argument.
identifier
asserts that the next token in the stream is an Identifier
, and then returns the identifier string if it succeeds.
type ProgramParser = Parsec Dec TokenStream
exact :: TokenType -> ProgramParser ()
exact t = Prim.token isToken Nothing
where
isToken t'
| t == tokenType t' = Right ()
| otherwise = Left
( S.singleton . Error.Tokens $ pure t'
, S.singleton . Error.Tokens $ pure t' { tokenType = t }
, S.empty
)
identifier :: ProgramParser String
identifier = Prim.token isIdent Nothing
where
isIdent t =
case tokenType t of
(Identifier str) -> Right str
_ -> Left
( S.singleton . Error.Label $ N.fromList "Identifier"
, S.empty
, S.empty
)
Program AST:
data Program
= Program [Block]
deriving (Eq, Show)
data Block
= Block [Block]
| Field String String
deriving (Eq, Show)
Program grammar:
semicolon :: ProgramParser ()
semicolon = exact Semicolon
begin :: ProgramParser ()
begin = exact Begin
end :: ProgramParser ()
end = exact End *> semicolon
endSubsection :: ProgramParser ()
endSubsection =
exact End *>
exact Subsection *>
semicolon
field :: ProgramParser Block
field = Field <$> identifier <*> identifier
block :: ProgramParser Block
block =
Block <$> between begin endSubsection (some block) <|>
field
program :: ProgramParser Program
program =
(Program <$> between begin end (some block)) <*
exact Eof