Skip to content

Instantly share code, notes, and snippets.

@pedrominicz
Last active March 10, 2023 19:16
Show Gist options
  • Save pedrominicz/df52398a4665314c70b515d35d7a98da to your computer and use it in GitHub Desktop.
Save pedrominicz/df52398a4665314c70b515d35d7a98da to your computer and use it in GitHub Desktop.
Alex: indentation-sensitive lexer
{
module Main (main) where
import Control.Applicative
import Control.Monad.State
import Data.Foldable
import Data.Maybe
import Data.Word
import System.Exit
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
}
$alpha = [A-Za-z]
$white = [\ \t\n\r]
tokens :-
[\n\r] $white* { indent }
[\ \t]+ ;
$alpha+ { word }
{
type Byte = Word8
type AlexInput = ByteString
data ParseState = ParseState
{ input :: AlexInput
, levels :: [Int]
, pending :: [Token]
}
initialState :: ByteString -> ParseState
initialState str = ParseState str [] []
alexGetByte :: AlexInput -> Maybe (Byte, AlexInput)
alexGetByte = B.uncons
data Token
= Word ByteString
| Indent
| Dedent
| Newline
| EOF
deriving (Eq, Show)
type Parse a = StateT ParseState Maybe a
headDef :: a -> [a] -> a
headDef x = fromMaybe x . listToMaybe
indent :: ByteString -> Parse Token
indent str = do
let len = B.length $ B.takeWhileEnd (\c -> c == 32 || c == 9) str
s <- get
let l = headDef 0 (levels s)
when (len > l) $ put s { levels = len : levels s, pending = [Indent] }
when (len < l) $ do
let (pre, post) = span (> len) (levels s)
guard $ headDef 0 post == len
put s { levels = post, pending = map (const Dedent) pre }
return Newline
word :: ByteString -> Parse Token
word str = return (Word str)
token :: Parse Token
token = do
s <- get
case pending s of
tk:tks -> do
put s { pending = tks }
return tk
[] -> case alexScan (input s) 0 of
AlexEOF -> return EOF
AlexError _ -> empty
AlexSkip input _ -> do
put s { input = input }
token
AlexToken input' len action -> do
put s { input = input' }
action (B.take len (input s))
lexer :: ByteString -> Maybe [Token]
lexer = evalStateT go . initialState
where
go :: Parse [Token]
go = do
tk <- token
if tk == EOF
then return []
else (tk :) <$> go
main :: IO ()
main = do
str <- B.getContents
case lexer str of
Nothing -> exitFailure
Just tks -> do
for_ tks $ \tk -> case tk of
Word str -> B.putStr str
Indent -> putChar '('
Dedent -> putChar ')'
_ -> return ()
putChar '\n'
}
#!/usr/bin/env bash
alex Main.x
ghc Main.hs
./Main <<EOF
a
x
y
z
b
x
y
i
j
k
EOF
./a.out <<EOF
a
b
b
EOF
./Main <<EOF
a
a
EOF
./Main <<EOF
a
a
EOF
./Main <<EOF
a
a
EOF
# FIXME: whitespace at the beginning are ignored, so the two examples below are
# interpreted in the exact same way.
./Main <<EOF
a
a
a
EOF
./Main <<EOF
a
a
a
EOF
# FIXME: dedent tokens are not emitted without an ending newline.
printf 'a\n a' |./Main
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment