Skip to content

Instantly share code, notes, and snippets.

@Spriithy
Last active August 2, 2017 21:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Spriithy/89bd33c78c2468bbb6b70e6c310f42d5 to your computer and use it in GitHub Desktop.
Save Spriithy/89bd33c78c2468bbb6b70e6c310f42d5 to your computer and use it in GitHub Desktop.
Primitive tokenizer in Haskell to learn the language
module Lex (
Token(..),
tokKind,
tokText,
tokPos,
Kind(..),
doLex,
) where
import Data.Char
import Text.Printf
import Text.Regex.TDFA
data Kind
= Invalid
| Skip
| EOL
| EOF
-- Actual token kinds
| Name
| Anonymous
| IntLit
| FloatLit
| Keyword
| Delimiter
| Operator
deriving (Show, Eq)
data Token = Token { line, col :: Int
, kind :: Kind
, text :: String
}
instance Show Token where
show tok = printf "l(%02d) c(%02d) %s %s" (line tok) (col tok) (show $ kind tok) (show $ text tok)
tokKind :: Token -> Kind
tokKind t = kind t
tokText :: Token -> String
tokText t = text t
tokPos :: Token -> (Int, Int)
tokPos t = (line t, col t)
keywords :: [String]
keywords
= [
]
operators :: [String]
operators = [
"+", "++", "+=",
"-", "--", "-=", "->",
"*", "*=",
"/", "/=",
"%", "%=",
">", ">>", ">=", ">>=",
"<", "<<", "<=", "<<=", "<-",
"!", "!!", "!=",
"|", "||", "|=",
"&", "&&", "&=",
"~", "~=",
"^", "^=",
"@", "="
]
delimiters :: [String]
delimiters = [
".", ",", ";", ":", "::", "...", "..",
"(", ")", "[", "]", "{", "}"
]
wrap :: Kind -> String -> Int -> Int -> Token
wrap k t l c = Token { line=l, col=c, kind=k, text=t }
kindOf :: String -> Kind
kindOf [ch]
| isDigit ch = IntLit
| isLetter ch = Name
| ch == '_' = Anonymous
| ch == '\n' = EOL
| isSpace ch = Skip -- \n is a space, but caught before
| isOp [ch] = Operator
| isDelim [ch] = Delimiter
| otherwise = Invalid
where isDelim str = str `elem` delimiters
kindOf str
| str `elem` keywords = Keyword
| str `elem` operators = Operator
| str `elem` delimiters = Delimiter
| isFloatLit str = FloatLit
| isIntLit str = IntLit
| isIdent str = Name
| isOp str = Operator
| otherwise = Invalid
where isIntLit = (\ s -> s =~ "(0[xX][0-9a-fA-F]+|[1-9]([0-9]+)?)")
isFloatLit = (\ s -> s =~ "([0-9]+\\.([0-9]+)?|([0-9]+)?\\.[0-9]+)")
isIdent "" = False
isIdent s = s == (nextMatch str "[_a-zA-Z]" "[_a-zA-Z]([a-zA-Z0-9_]+)?")
tokenize :: String -> Int -> Int -> [Token]
tokenize [] l c = [wrap EOF "" l c]
tokenize (ch:[]) l c
| kch == EOL = [wrap EOF "" (l + 1) c]
| kch == Skip = [wrap EOF "" l (c + 1)]
| kch == Invalid = error $ "Unexpected character before end of file: '" ++ [ch] ++ "'"
| otherwise = [wrap kch [ch] l c, wrap EOF "" l (c + 1)]
where kch = kindOf [ch]
tokenize str@(ch:chs) l c
| kch == EOL = tokenize chs (l + 1) 1
| kch == Skip = tokenize chs l (c + 1)
| identS ch = (wrap (kindOf ident) ident l c):(tokenize (drop (length ident - 1) chs) l (c + length ident))
| kch == Operator = (wrap Operator op l c):(tokenize (drop (length op) str) l (c + length op))
| kch == IntLit = (wrap tnum num l c):(tokenize (drop (length num) str) l (c + length num))
| kch == Delimiter =
if (ch == '.') && (isDigit $ head chs) then
(wrap FloatLit num l c):(tokenize (drop (length num - 1) str) l (c + length num))
else
(wrap Delimiter [ch] l c):(tokenize chs l (c + 1))
| otherwise = error $ "Unexpected character: '" ++ [ch] ++ "' at line " ++ show l ++ ", column " ++ show c
where kch = kindOf [ch]
identS = (\ k -> isLetter k || k == '_')
identP = (\ k -> identS k || isDigit k)
ident = ch:(takeWhile identP chs)
op = nextOp str
(tnum, num) = nextNumber str
doLex :: String -> [Token]
doLex s
| length s > 0 = tokenize s 1 1
| otherwise = error "Nothing to lex"
-- Used to retrieve the longest matching string given 2 patterns:
-- One to identify the head, one for the tail of any matching string
-- Source head tail Result
nextMatch :: String -> String -> String -> String
nextMatch [ch] hd _
| [ch] =~ hd = [ch]
| otherwise = ""
nextMatch (ch:str) hd tl
| [ch] =~ hd = ch:(nextMatch str tl tl)
| otherwise = ""
nextMatch _ _ _ = ""
nextNumber :: String -> (Kind, String)
nextNumber [ch] = (IntLit, nextMatch [ch] "[0-9]" "")
nextNumber (ch:chs)
| ch == '0' =
if length hex > 1 then
(IntLit, ch:hex)
else
if hcs =~ "[xX]" then
error ("Malformed numeric litteral '" ++ [ch] ++ hcs ++ "'")
else if hcs == "." then
let nnum = snd (nextNumber $ tail chs) in
(FloatLit, "0." ++ nnum)
else (IntLit, "0")
| [ch] =~ "[1-9]" = (IntLit, ch:(nextMatch chs "[0-9]" "[0-9]+"))
| ch == '.' =
if tnum == IntLit then
(FloatLit, '0':ch:num)
else
error $ "Unexpected float litteral after '.': '" ++ num ++ "'"
where (tnum, num) = nextNumber chs
hex = nextMatch chs "[xX]" "[0-9a-fA-F]+"
hcs = [head chs]
nextNumber _ = (Invalid, "")
isOp :: String -> Bool
isOp o = o `elem` operators
nextOp :: String -> String
nextOp [ch]
| isOp [ch] = [ch]
| otherwise = ""
nextOp (ch:str)
| isOp (ch:la2) = ch:la2
| isOp (ch:la1) = ch:la1
| isOp [ch] = [ch]
| otherwise = ""
where la1 = [head str]
la2 = if length str > 1 then drop 2 str else la1
nextOp _ = ""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment