Skip to content

Instantly share code, notes, and snippets.

@osa1
Created May 19, 2012 13:13
Show Gist options
  • Save osa1/2730819 to your computer and use it in GitHub Desktop.
Save osa1/2730819 to your computer and use it in GitHub Desktop.
hand-written lexer for EtuLang (a hypothetical PL for a PL course) in Haskell
{-# OPTIONS_GHC -Wall #-}
module Lexer where
import Prelude hiding (lex)
import Control.Monad.State
import qualified Data.Set as S
import Char (toLower)
data LexemeClass
= LLParen
| LRParen
| LAssign
| LAdd
| LSub
| LDiv
| LMult
| LSemicolon
| LNoteq
| LGreater
| LLess
| LGreq
| LLesseq
| LComma
| LColon
| LId
| LInt
| LKeyword
| LEOF
deriving (Show, Eq)
data Token = Token LexemeClass Pos String deriving (Show)
data Pos = Pos { line :: Int , col :: Int } deriving (Show)
data LexerState = LexerState
{ currentPos :: Pos
, rest :: !String
, curLexeme :: String
} deriving (Show)
type LexerError = (String, Pos)
type Lexer = State LexerState (Either LexerError Token)
keywords :: S.Set String
keywords = S.fromList [
"begin", "end", "if", "then", "else", "while", "program", "integer"
]
initLexerState :: String -> LexerState
initLexerState s = LexerState (Pos 0 0) s []
runLexer :: Lexer -> LexerState -> (Either LexerError Token, LexerState)
runLexer = runState
evalLexer :: Lexer -> LexerState -> Either LexerError Token
evalLexer = evalState
skipComment :: LexerState -> Either LexerError LexerState
skipComment (LexerState p [] _) = Left $ ("EOF while reading comment", p)
skipComment (LexerState (Pos l c) (f:r) _)
| f == '%' = Right $ LexerState (Pos l (c+1)) r ""
| f == '\n' = skipComment $ LexerState (Pos (l+1) 0) r ""
| otherwise = skipComment $ LexerState (Pos l (c+1)) r ""
readToken :: [Char] -> LexemeClass -> Lexer
readToken chars cls = do
s <- get
case s of
(LexerState p@(Pos l c) [] lexeme) -> do
put (LexerState p [] "")
return $ Right $ Token cls (Pos l (c-(length lexeme))) lexeme
(LexerState (Pos l c) str@(f:r) lexeme)
| toLower f `elem` chars -> do
put (LexerState (Pos l (c+1)) r (lexeme++[f]))
readToken chars cls
| otherwise -> do
let pos' = if f == '\n'
then (Pos (l+1) 0)
else (Pos l c)
put $ LexerState pos' str ""
return $ Right $ Token cls (Pos l (c-(length lexeme))) lexeme
readId :: Lexer
readId = do
t <- readToken (['0'..'9'] ++ ['a'..'z']) LId
case t of
Left err -> return $ Left err
Right tok@(Token _ p s) ->
if S.member s keywords
then return $ Right (Token LKeyword p s)
else return $ Right tok
readInt :: Lexer
readInt = readToken ['0'..'9'] LInt
incCol :: LexerState -> LexerState
incCol (LexerState (Pos l c) (_:r) lm) = LexerState (Pos l (c+1)) r lm
lex' :: Lexer
lex' = do
st@(LexerState p s _) <- get
put $ incCol st
case s of
[] -> return $ Right $ Token LEOF p ""
(f:r) | toLower f `elem` ['a'..'z'] -> readId
| f `elem` ['0'..'9'] -> readInt
| f == '(' -> return $ Right $ Token LLParen p "("
| f == ')' -> return $ Right $ Token LRParen p ")"
| f == ':' -> case r of
('=':_) -> do put $ incCol st
return $ Right $ Token LAssign p ":="
_ -> return $ Right $ Token LColon p ":"
| f == '+' -> return $ Right $ Token LAdd p "+"
| f == '-' -> return $ Right $ Token LSub p "-"
| f == '/' -> return $ Right $ Token LDiv p "/"
| f == '*' -> return $ Right $ Token LMult p "*"
| f == ';' -> return $ Right $ Token LSemicolon p ";"
| f == '<' -> case r of
('>':_) -> do put $ incCol st
return $ Right $ Token LNoteq p "<>"
_ -> return $ Right $ Token LLess p "<"
| f == '>' -> case r of
('=':_) -> do put $ incCol st
return $ Right $ Token LGreq p ">="
_ -> return $ Right $ Token LGreater p ">"
| f == ',' -> return $ Right $ Token LComma p ","
| f == '%' -> do let st' = skipComment st
either (\e -> return $ Left e)
(\ns -> (put ns) >> lex')
st'
| f == ' ' -> (put $ incCol st) >> lex'
| f == '\n' -> (put $ incCol st) >> lex'
| otherwise -> return $ Left ("Unrecognized character.", p)
lex :: String -> Either LexerError [Token]
lex s = lexA (initLexerState s) []
where lexA :: LexerState -> [Token] -> Either LexerError [Token]
lexA state tokens =
let (r,s) = runLexer lex' state
in case r of
Left err -> Left err
Right t@(Token c _ _) ->
if c == LEOF
then Right $ tokens ++ [t]
else lexA s (tokens++[t])
main :: IO ()
main = do
input <- getContents
let tokens = lex input
putStrLn $ show tokens
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment