-
-
Save heitor-lassarote/9cc4203990352f5df5f6b84da9291df9 to your computer and use it in GitHub Desktop.
Lexer.x and Parser.y: Solutions to exercises 1, 2, and 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
module Lexer | |
( -- * Invoking Alex | |
Alex | |
, AlexPosn (..) | |
, alexGetInput | |
, alexError | |
, runAlex | |
, alexMonadScan | |
, Range (..) | |
, RangedToken (..) | |
, Token (..) | |
, scanMany | |
) where | |
import Control.Monad (when) | |
import Data.ByteString.Lazy.Char8 (ByteString) | |
import qualified Data.ByteString.Lazy.Char8 as BS | |
import Data.Scientific (Scientific) | |
} | |
%wrapper "monadUserState-bytestring" | |
$digit = [0-9] | |
$alpha = [a-zA-Z] | |
@id = ($alpha | \_) ($alpha | $digit | \_ | \' | \?)* | |
@exponent = e (\- | \+)? $digit+ | |
tokens :- | |
<0> $white+ ; | |
<0> "(*" { nestComment `andBegin` comment } | |
<0> "*)" { \_ _ -> alexError "Error: unexpected closing comment" } | |
<comment> "(*" { nestComment } | |
<comment> "*)" { unnestComment } | |
<comment> . ; | |
<comment> \n ; | |
-- Note that . doesn't match newlines, which is what we want. | |
<0> "#" .* ; | |
-- Keywords | |
<0> let { tok Let } | |
<0> in { tok In } | |
<0> if { tok If } | |
<0> then { tok Then } | |
<0> else { tok Else } | |
-- Arithmetic operators | |
<0> "+" { tok Plus } | |
<0> "-" { tok Minus } | |
<0> "*" { tok Times } | |
<0> "/" { tok Divide } | |
-- Comparison operators | |
<0> "=" { tok Eq } | |
<0> "<>" { tok Neq } | |
<0> "<" { tok Lt } | |
<0> "<=" { tok Le } | |
<0> ">" { tok Gt } | |
<0> ">=" { tok Ge } | |
-- Logical operators | |
<0> "&" { tok And } | |
<0> "|" { tok Or } | |
-- Parenthesis | |
<0> "(" { tok LPar } | |
<0> ")" { tok RPar } | |
-- Lists | |
<0> "[" { tok LBrack } | |
<0> "]" { tok RBrack } | |
<0> "," { tok Comma } | |
-- Types | |
<0> ":" { tok Colon } | |
<0> "->" { tok Arrow } | |
-- Projection | |
<0> \. { tok Dot } | |
-- Exercise 3 bonus: | |
-- Pattern matching | |
<0> match { tok Match } | |
<0> with { tok With } | |
-- Identifiers | |
<0> @id { tokId } | |
-- Constants | |
-- Exercise 1: | |
<0> $digit+ (\. $digit+)? @exponent? { tokNumber } | |
<0> \" { enterString `andBegin` string } | |
<string> \" { exitString `andBegin` 0 } | |
<string> \\\\ { emit '\\' } | |
<string> \\\" { emit '"' } | |
<string> \\n { emit '\n' } | |
<string> \\t { emit '\t' } | |
<string> . { emitCurrent } | |
{ | |
data AlexUserState = AlexUserState | |
{ nestLevel :: Int | |
, strStart :: AlexPosn | |
, strBuffer :: [Char] | |
} | |
alexInitUserState :: AlexUserState | |
alexInitUserState = AlexUserState | |
{ nestLevel = 0 | |
, strStart = AlexPn 0 0 0 | |
, strBuffer = [] | |
} | |
get :: Alex AlexUserState | |
get = Alex $ \s -> Right (s, alex_ust s) | |
put :: AlexUserState -> Alex () | |
put s' = Alex $ \s -> Right (s{alex_ust = s'}, ()) | |
modify :: (AlexUserState -> AlexUserState) -> Alex () | |
modify f = Alex $ \s -> Right (s{alex_ust = f (alex_ust s)}, ()) | |
alexEOF :: Alex RangedToken | |
alexEOF = do | |
(pos, _, _, _) <- alexGetInput | |
startCode <- alexGetStartCode | |
when (startCode == comment) $ | |
alexError "Error: unclosed comment" | |
when (startCode == string) $ | |
alexError "Error: unclosed string" | |
pure $ RangedToken EOF (Range pos pos) | |
data Range = Range | |
{ start :: AlexPosn | |
, stop :: AlexPosn | |
} deriving (Eq, Show) | |
data RangedToken = RangedToken | |
{ rtToken :: Token | |
, rtRange :: Range | |
} deriving (Eq, Show) | |
data Token | |
-- Identifiers | |
= Identifier ByteString | |
-- Constants | |
| String ByteString | |
| Number Scientific | |
-- Keywords | |
| Let | |
| In | |
| If | |
| Then | |
| Else | |
-- Arithmetic operators | |
| Plus | |
| Minus | |
| Times | |
| Divide | |
-- Comparison operators | |
| Eq | |
| Neq | |
| Lt | |
| Le | |
| Gt | |
| Ge | |
-- Logical operators | |
| And | |
| Or | |
-- Parenthesis | |
| LPar | |
| RPar | |
-- Lists | |
| Comma | |
| LBrack | |
| RBrack | |
-- Types | |
| Colon | |
| Arrow | |
| VBar | |
-- Projection | |
| Dot | |
-- Patern matching | |
| Match | |
| With | |
-- EOF | |
| EOF | |
deriving (Eq, Show) | |
mkRange :: AlexInput -> Int64 -> Range | |
mkRange (start, _, str, _) len = Range{start = start, stop = stop} | |
where | |
stop = BS.foldl' alexMove start $ BS.take len str | |
tok :: Token -> AlexAction RangedToken | |
tok ctor inp len = | |
pure RangedToken | |
{ rtToken = ctor | |
, rtRange = mkRange inp len | |
} | |
tokId, tokNumber :: AlexAction RangedToken | |
tokId inp@(_, _, str, _) len = | |
pure RangedToken | |
{ rtToken = Identifier $ BS.take len str | |
, rtRange = mkRange inp len | |
} | |
tokNumber inp@(_ ,_, str, _) len = | |
pure RangedToken | |
{ rtToken = Number $ read $ BS.unpack $ BS.take len str | |
, rtRange = mkRange inp len | |
} | |
enterString, exitString :: AlexAction RangedToken | |
enterString inp@(pos, _, _, _) len = do | |
modify $ \s -> s{strStart = pos, strBuffer = '"' : strBuffer s} | |
skip inp len | |
exitString inp@(pos, _, _, _) len = do | |
s <- get | |
put s{strStart = AlexPn 0 0 0, strBuffer = []} | |
pure RangedToken | |
{ rtToken = String $ BS.pack $ reverse $ '"' : strBuffer s | |
, rtRange = Range (strStart s) (alexMove pos '"') | |
} | |
emit :: Char -> AlexAction RangedToken | |
emit c inp@(_, _, str, _) len = do | |
modify $ \s -> s{strBuffer = c : strBuffer s} | |
skip inp len | |
emitCurrent :: AlexAction RangedToken | |
emitCurrent inp@(_, _, str, _) len = do | |
modify $ \s -> s{strBuffer = BS.head str : strBuffer s} | |
skip inp len | |
nestComment, unnestComment :: AlexAction RangedToken | |
nestComment input len = do | |
modify $ \s -> s{nestLevel = nestLevel s + 1} | |
skip input len | |
unnestComment input len = do | |
state <- get | |
let level = nestLevel state - 1 | |
put state{nestLevel = level} | |
when (level == 0) $ | |
alexSetStartCode 0 | |
skip input len | |
scanMany :: ByteString -> Either String [RangedToken] | |
scanMany input = runAlex input go | |
where | |
go = do | |
output <- alexMonadScan | |
if rtToken output == EOF | |
then pure [output] | |
else (output :) <$> go | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
{-# LANGUAGE DeriveFoldable #-} | |
module Parser | |
( parseMiniML | |
) where | |
import Data.ByteString.Lazy.Char8 (ByteString) | |
import Data.Maybe (fromJust) | |
import Data.Monoid (First (..)) | |
import Data.List.NonEmpty (NonEmpty) | |
import qualified Data.List.NonEmpty as NE | |
import Data.Scientific (Scientific) | |
import qualified Lexer as L | |
} | |
%name parseMiniML decs | |
%tokentype { L.RangedToken } | |
%error { parseError } | |
%monad { L.Alex } { >>= } { pure } | |
%lexer { lexer } { L.RangedToken L.EOF _ } | |
%expect 0 | |
%token | |
-- Identifiers | |
identifier { L.RangedToken (L.Identifier _) _ } | |
-- Constants | |
string { L.RangedToken (L.String _) _ } | |
number { L.RangedToken (L.Number _) _ } | |
-- Keywords | |
let { L.RangedToken L.Let _ } | |
in { L.RangedToken L.In _ } | |
if { L.RangedToken L.If _ } | |
then { L.RangedToken L.Then _ } | |
else { L.RangedToken L.Else _ } | |
-- Arithmetic operators | |
'+' { L.RangedToken L.Plus _ } | |
'-' { L.RangedToken L.Minus _ } | |
'*' { L.RangedToken L.Times _ } | |
'/' { L.RangedToken L.Divide _ } | |
-- Comparison operators | |
'=' { L.RangedToken L.Eq _ } | |
'<>' { L.RangedToken L.Neq _ } | |
'<' { L.RangedToken L.Lt _ } | |
'<=' { L.RangedToken L.Le _ } | |
'>' { L.RangedToken L.Gt _ } | |
'>=' { L.RangedToken L.Ge _ } | |
-- Logical operators | |
'&' { L.RangedToken L.And _ } | |
'|' { L.RangedToken L.Or _ } | |
-- Parenthesis | |
'(' { L.RangedToken L.LPar _ } | |
')' { L.RangedToken L.RPar _ } | |
-- Lists | |
'[' { L.RangedToken L.LBrack _ } | |
']' { L.RangedToken L.RBrack _ } | |
',' { L.RangedToken L.Comma _ } | |
-- Types | |
':' { L.RangedToken L.Colon _ } | |
'->' { L.RangedToken L.Arrow _ } | |
-- Projection | |
'.' { L.RangedToken L.Dot _ } | |
-- Pattern matching | |
match { L.RangedToken L.Match _ } | |
with { L.RangedToken L.With _ } | |
%right else in | |
%right '->' | |
%left '|' | |
%left '&' | |
%nonassoc '=' '<>' '<' '>' '<=' '>=' | |
%left '+' '-' | |
%left '*' '/' | |
%% | |
optional(p) | |
: { Nothing } | |
| p { Just $1 } | |
many_rev(p) | |
: { [] } | |
| many_rev(p) p { $2 : $1 } | |
many(p) | |
: many_rev(p) { reverse $1 } | |
sepBy_rev(p, sep) | |
: { [] } | |
| sepBy_rev(p, sep) sep p { $3 : $1 } | |
sepBy(p, sep) | |
: sepBy_rev(p, sep) { reverse $1 } | |
name :: { Name L.Range } | |
: identifier { unTok $1 (\range (L.Identifier name) -> Name range name) } | |
type :: { Type L.Range } | |
: name { TVar (info $1) $1 } | |
| '(' ')' { TUnit (L.rtRange $1 <-> L.rtRange $2) } | |
| '(' type ')' { TPar (L.rtRange $1 <-> L.rtRange $3) $2 } | |
| '[' type ']' { TList (L.rtRange $1 <-> L.rtRange $3) $2 } | |
| type '->' type { TArrow (info $1 <-> info $3) $1 $3 } | |
typeAnnotation :: { Type L.Range } | |
: ':' type { $2 } | |
dec :: { Dec L.Range } | |
: let name many(pat) optional(typeAnnotation) '=' exp { Dec (L.rtRange $1 <-> info $6) $2 $3 $4 $6 } | |
decs :: { [Dec L.Range] } | |
: many(dec) { $1 } | |
exp :: { Exp L.Range } | |
: expapp { $1 } | |
| expcond { $1 } | |
| '-' exp { ENeg (L.rtRange $1 <-> info $2) $2 } | |
-- Arithmetic operators | |
| exp '+' exp { EBinOp (info $1 <-> info $3) $1 (Plus (L.rtRange $2)) $3 } | |
| exp '-' exp { EBinOp (info $1 <-> info $3) $1 (Minus (L.rtRange $2)) $3 } | |
| exp '*' exp { EBinOp (info $1 <-> info $3) $1 (Times (L.rtRange $2)) $3 } | |
| exp '/' exp { EBinOp (info $1 <-> info $3) $1 (Divide (L.rtRange $2)) $3 } | |
-- Comparison operators | |
| exp '=' exp { EBinOp (info $1 <-> info $3) $1 (Eq (L.rtRange $2)) $3 } | |
| exp '<>' exp { EBinOp (info $1 <-> info $3) $1 (Neq (L.rtRange $2)) $3 } | |
| exp '<' exp { EBinOp (info $1 <-> info $3) $1 (Lt (L.rtRange $2)) $3 } | |
| exp '<=' exp { EBinOp (info $1 <-> info $3) $1 (Le (L.rtRange $2)) $3 } | |
| exp '>' exp { EBinOp (info $1 <-> info $3) $1 (Gt (L.rtRange $2)) $3 } | |
| exp '>=' exp { EBinOp (info $1 <-> info $3) $1 (Ge (L.rtRange $2)) $3 } | |
-- Logical operators | |
| exp '&' exp { EBinOp (info $1 <-> info $3) $1 (And (L.rtRange $2)) $3 } | |
| exp '|' exp { EBinOp (info $1 <-> info $3) $1 (Or (L.rtRange $2)) $3 } | |
| dec in exp { ELetIn (info $1 <-> info $3) $1 $3 } | |
-- Exercise 3 bonus: | |
-- Note: Suppose we have this: | |
-- match 0 with | 0 -> match 1 with | 1 -> 1 | 2 -> 2 | |
-- Shifting will cause it to parse like so: | |
-- match 0 with | 0 -> (match 1 with | 1 -> 1 | 2 -> 2) | |
-- Reducing would give: | |
-- (match 0 with | 0 -> match 1 with | 1 -> 1) | 2 -> 2 | |
| match exp with many_rev(case) %shift | |
{ | |
let | |
cases = reverse $4 | |
endPos = maybe (L.rtRange $3) (info . NE.last) (NE.nonEmpty cases) | |
in | |
EMatch (L.rtRange $1 <-> endPos) $2 cases | |
} | |
case :: { Case L.Range } | |
: '|' pat '->' exp { Case (L.rtRange $1 <-> info $4) $2 $4 } | |
expapp :: { Exp L.Range } | |
: expapp projatom { EApp (info $1 <-> info $2) $1 $2 } | |
| projatom { $1 } | |
expcond :: { Exp L.Range } | |
: if exp then exp %shift { EIfThen (L.rtRange $1 <-> info $4) $2 $4 } | |
| if exp then exp else exp { EIfThenElse (L.rtRange $1 <-> info $6) $2 $4 $6 } | |
-- Exercise 2: | |
projatom :: { Exp L.Range } | |
: atom many(projection) | |
{ maybe $1 (\projs -> EProj (info $1 <-> info (NE.last projs)) $1 projs) (NE.nonEmpty $2) } | |
atom :: { Exp L.Range } | |
: number { unTok $1 (\range (L.Number num) -> ENumber range num) } | |
| name { EVar (info $1) $1 } | |
| string { unTok $1 (\range (L.String string) -> EString range string) } | |
| '(' ')' { EUnit (L.rtRange $1 <-> L.rtRange $2) } | |
| '[' sepBy(exp, ',') ']' { EList (L.rtRange $1 <-> L.rtRange $3) $2 } | |
| '(' exp ')' { EPar (L.rtRange $1 <-> L.rtRange $3) $2 } | |
-- Arithmetic operators | |
| '(' '+' ')' { EOp (L.rtRange $1 <-> L.rtRange $3) (Plus (L.rtRange $2)) } | |
| '(' '-' ')' { EOp (L.rtRange $1 <-> L.rtRange $3) (Minus (L.rtRange $2)) } | |
| '(' '*' ')' { EOp (L.rtRange $1 <-> L.rtRange $3) (Times (L.rtRange $2)) } | |
| '(' '/' ')' { EOp (L.rtRange $1 <-> L.rtRange $3) (Divide (L.rtRange $2)) } | |
-- Comparison operators | |
| '(' '=' ')' { EOp (L.rtRange $1 <-> L.rtRange $3) (Eq (L.rtRange $2)) } | |
| '(' '<>' ')' { EOp (L.rtRange $1 <-> L.rtRange $3) (Neq (L.rtRange $2)) } | |
| '(' '<' ')' { EOp (L.rtRange $1 <-> L.rtRange $3) (Lt (L.rtRange $2)) } | |
| '(' '<=' ')' { EOp (L.rtRange $1 <-> L.rtRange $3) (Le (L.rtRange $2)) } | |
| '(' '>' ')' { EOp (L.rtRange $1 <-> L.rtRange $3) (Gt (L.rtRange $2)) } | |
| '(' '>=' ')' { EOp (L.rtRange $1 <-> L.rtRange $3) (Ge (L.rtRange $2)) } | |
-- Logical operators | |
| '(' '&' ')' { EOp (L.rtRange $1 <-> L.rtRange $3) (And (L.rtRange $2)) } | |
| '(' '|' ')' { EOp (L.rtRange $1 <-> L.rtRange $3) (Or (L.rtRange $2)) } | |
-- Exercise 3: | |
pat :: { Pat L.Range } | |
: number { unTok $1 (\range (L.Number num) -> PNumber range num) } | |
| name { PVar (info $1) $1 } | |
| string { unTok $1 (\range (L.String string) -> PString range string) } | |
| '(' ')' { PUnit (L.rtRange $1 <-> L.rtRange $2) } | |
| '(' pat ')' { PPar (L.rtRange $1 <-> L.rtRange $3) $2 } | |
| '(' pat typeAnnotation ')' { PAnnot (L.rtRange $1 <-> L.rtRange $4) $2 $3 } | |
| '[' sepBy(pat, ',') ']' { PList (L.rtRange $1 <-> L.rtRange $3) $2 } | |
projection :: { Proj L.Range } | |
: '.' '(' exp ')' { ProjList (L.rtRange $1 <-> L.rtRange $4) $3 } | |
{ | |
parseError :: L.RangedToken -> L.Alex a | |
parseError _ = do | |
(L.AlexPn _ line column, _, _, _) <- L.alexGetInput | |
L.alexError $ "Parse error at line " <> show line <> ", column " <> show column | |
lexer :: (L.RangedToken -> L.Alex a) -> L.Alex a | |
lexer = (=<< L.alexMonadScan) | |
-- | Build a simple node by extracting its token type and range. | |
unTok :: L.RangedToken -> (L.Range -> L.Token -> a) -> a | |
unTok (L.RangedToken tok range) ctor = ctor range tok | |
-- | Unsafely extracts the the metainformation field of a node. | |
info :: Foldable f => f a -> a | |
info = fromJust . getFirst . foldMap pure | |
-- | Performs the union of two ranges by creating a new range starting at the | |
-- start position of the first range, and stopping at the stop position of the | |
-- second range. | |
-- Invariant: The LHS range starts before the RHS range. | |
(<->) :: L.Range -> L.Range -> L.Range | |
L.Range a1 _ <-> L.Range _ b2 = L.Range a1 b2 | |
-- * AST | |
data Name a | |
= Name a ByteString | |
deriving (Foldable, Show) | |
data Type a | |
= TVar a (Name a) | |
| TPar a (Type a) | |
| TUnit a | |
| TList a (Type a) | |
| TArrow a (Type a) (Type a) | |
deriving (Foldable, Show) | |
data Dec a | |
= Dec a (Name a) [Pat a] (Maybe (Type a)) (Exp a) | |
deriving (Foldable, Show) | |
data Operator a | |
= Plus a | |
| Minus a | |
| Times a | |
| Divide a | |
| Eq a | |
| Neq a | |
| Lt a | |
| Le a | |
| Gt a | |
| Ge a | |
| And a | |
| Or a | |
deriving (Foldable, Show) | |
data Proj a | |
= ProjList a (Exp a) | |
deriving (Foldable, Show) | |
data Case a | |
= Case a (Pat a) (Exp a) | |
deriving (Foldable, Show) | |
data Exp a | |
= ENumber a Scientific | |
| EVar a (Name a) | |
| EString a ByteString | |
| EUnit a | |
| EList a [Exp a] | |
| EPar a (Exp a) | |
| EApp a (Exp a) (Exp a) | |
| EIfThen a (Exp a) (Exp a) | |
| EIfThenElse a (Exp a) (Exp a) (Exp a) | |
| ENeg a (Exp a) | |
| EBinOp a (Exp a) (Operator a) (Exp a) | |
| EOp a (Operator a) | |
| ELetIn a (Dec a) (Exp a) | |
| EProj a (Exp a) (NonEmpty (Proj a)) | |
| EMatch a (Exp a) [Case a] | |
deriving (Foldable, Show) | |
data Pat a | |
= PNumber a Scientific | |
| PVar a (Name a) | |
| PString a ByteString | |
| PUnit a | |
| PPar a (Pat a) | |
| PAnnot a (Pat a) (Type a) | |
| PList a [Pat a] | |
deriving (Foldable, Show) | |
} |
Hey, @Kleidukos, I just tried these snippets with the latest nightly LTS (2022-09-17), and those versions of Alex and Happy:
❯ stack ls dependencies | grep -E 'alex|happy'
Stack has not been tested with GHC versions above 9.0, and using 9.2.4, this may fail
Stack has not been tested with Cabal versions above 3.4, but version 3.6.3.0 was found, this may fail
alex 3.2.7.1
happy 1.20.0
It seems to work for me:
λ= [1]: runAlex "let example = 42" parseMiniML
Out [1]:
Right [Dec (Range {start = AlexPn 0 1 1, stop = AlexPn 16 1 17}) (Name (Range {start = AlexPn 4 1 5, stop = AlexPn 11 1 12}) "example") [] Nothing (ENumber (Range {start = AlexPn 14 1 15, stop = AlexPn 16 1 17}) 42.0)]
But I have one question: are you compiling your Happy module using StrictData
? Happy does not like this extension and will fail in the same way if you use it:
λ= [1]: runAlex "let example = 42" parseMiniML
Out [1]:
*** Exception: Internal Happy error
CallStack (from HasCallStack):
error, called at /home/heitor/Projects/alex-and-happy/.stack-work/dist/x86_64-linux-tinfo6/Cabal-3.6.3.0/build/Parser.hs:1737:17 in main:Parser
@heitor-lassarote Indeed! It was in my *.cabal file and so I have added NoStrictData
in the pragmas. :)
Thanks a lot!!
@Kleidukos, no problem. I've made an edit to the article warning about StrictData
that hopefully will be merged soon, as it seems to be a common pitfall. I also did the same thing and got confused about the error in the past, hence I asked. :) One of my friends also fell into the same pitfall.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@heitor-lassarote Hi! I'm using these snippets with GHC 9.2.4 and the latest versions of Alex & Happy (though they haven't changed much in the last years). Unfortunately,
happy
is not very satisfied with it:I opened a ticket here but I was wondering if you knew anything about why?