Last active
August 2, 2017 21:32
-
-
Save Spriithy/89bd33c78c2468bbb6b70e6c310f42d5 to your computer and use it in GitHub Desktop.
Primitive tokenizer in Haskell to learn the language
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Lex ( | |
Token(..), | |
tokKind, | |
tokText, | |
tokPos, | |
Kind(..), | |
doLex, | |
) where | |
import Data.Char | |
import Text.Printf | |
import Text.Regex.TDFA | |
data Kind | |
= Invalid | |
| Skip | |
| EOL | |
| EOF | |
-- Actual token kinds | |
| Name | |
| Anonymous | |
| IntLit | |
| FloatLit | |
| Keyword | |
| Delimiter | |
| Operator | |
deriving (Show, Eq) | |
data Token = Token { line, col :: Int | |
, kind :: Kind | |
, text :: String | |
} | |
instance Show Token where | |
show tok = printf "l(%02d) c(%02d) %s %s" (line tok) (col tok) (show $ kind tok) (show $ text tok) | |
tokKind :: Token -> Kind | |
tokKind t = kind t | |
tokText :: Token -> String | |
tokText t = text t | |
tokPos :: Token -> (Int, Int) | |
tokPos t = (line t, col t) | |
keywords :: [String] | |
keywords | |
= [ | |
] | |
operators :: [String] | |
operators = [ | |
"+", "++", "+=", | |
"-", "--", "-=", "->", | |
"*", "*=", | |
"/", "/=", | |
"%", "%=", | |
">", ">>", ">=", ">>=", | |
"<", "<<", "<=", "<<=", "<-", | |
"!", "!!", "!=", | |
"|", "||", "|=", | |
"&", "&&", "&=", | |
"~", "~=", | |
"^", "^=", | |
"@", "=" | |
] | |
delimiters :: [String] | |
delimiters = [ | |
".", ",", ";", ":", "::", "...", "..", | |
"(", ")", "[", "]", "{", "}" | |
] | |
wrap :: Kind -> String -> Int -> Int -> Token | |
wrap k t l c = Token { line=l, col=c, kind=k, text=t } | |
kindOf :: String -> Kind | |
kindOf [ch] | |
| isDigit ch = IntLit | |
| isLetter ch = Name | |
| ch == '_' = Anonymous | |
| ch == '\n' = EOL | |
| isSpace ch = Skip -- \n is a space, but caught before | |
| isOp [ch] = Operator | |
| isDelim [ch] = Delimiter | |
| otherwise = Invalid | |
where isDelim str = str `elem` delimiters | |
kindOf str | |
| str `elem` keywords = Keyword | |
| str `elem` operators = Operator | |
| str `elem` delimiters = Delimiter | |
| isFloatLit str = FloatLit | |
| isIntLit str = IntLit | |
| isIdent str = Name | |
| isOp str = Operator | |
| otherwise = Invalid | |
where isIntLit = (\ s -> s =~ "(0[xX][0-9a-fA-F]+|[1-9]([0-9]+)?)") | |
isFloatLit = (\ s -> s =~ "([0-9]+\\.([0-9]+)?|([0-9]+)?\\.[0-9]+)") | |
isIdent "" = False | |
isIdent s = s == (nextMatch str "[_a-zA-Z]" "[_a-zA-Z]([a-zA-Z0-9_]+)?") | |
tokenize :: String -> Int -> Int -> [Token] | |
tokenize [] l c = [wrap EOF "" l c] | |
tokenize (ch:[]) l c | |
| kch == EOL = [wrap EOF "" (l + 1) c] | |
| kch == Skip = [wrap EOF "" l (c + 1)] | |
| kch == Invalid = error $ "Unexpected character before end of file: '" ++ [ch] ++ "'" | |
| otherwise = [wrap kch [ch] l c, wrap EOF "" l (c + 1)] | |
where kch = kindOf [ch] | |
tokenize str@(ch:chs) l c | |
| kch == EOL = tokenize chs (l + 1) 1 | |
| kch == Skip = tokenize chs l (c + 1) | |
| identS ch = (wrap (kindOf ident) ident l c):(tokenize (drop (length ident - 1) chs) l (c + length ident)) | |
| kch == Operator = (wrap Operator op l c):(tokenize (drop (length op) str) l (c + length op)) | |
| kch == IntLit = (wrap tnum num l c):(tokenize (drop (length num) str) l (c + length num)) | |
| kch == Delimiter = | |
if (ch == '.') && (isDigit $ head chs) then | |
(wrap FloatLit num l c):(tokenize (drop (length num - 1) str) l (c + length num)) | |
else | |
(wrap Delimiter [ch] l c):(tokenize chs l (c + 1)) | |
| otherwise = error $ "Unexpected character: '" ++ [ch] ++ "' at line " ++ show l ++ ", column " ++ show c | |
where kch = kindOf [ch] | |
identS = (\ k -> isLetter k || k == '_') | |
identP = (\ k -> identS k || isDigit k) | |
ident = ch:(takeWhile identP chs) | |
op = nextOp str | |
(tnum, num) = nextNumber str | |
doLex :: String -> [Token] | |
doLex s | |
| length s > 0 = tokenize s 1 1 | |
| otherwise = error "Nothing to lex" | |
-- Used to retrieve the longest matching string given 2 patterns: | |
-- One to identify the head, one for the tail of any matching string | |
-- Source head tail Result | |
nextMatch :: String -> String -> String -> String | |
nextMatch [ch] hd _ | |
| [ch] =~ hd = [ch] | |
| otherwise = "" | |
nextMatch (ch:str) hd tl | |
| [ch] =~ hd = ch:(nextMatch str tl tl) | |
| otherwise = "" | |
nextMatch _ _ _ = "" | |
nextNumber :: String -> (Kind, String) | |
nextNumber [ch] = (IntLit, nextMatch [ch] "[0-9]" "") | |
nextNumber (ch:chs) | |
| ch == '0' = | |
if length hex > 1 then | |
(IntLit, ch:hex) | |
else | |
if hcs =~ "[xX]" then | |
error ("Malformed numeric litteral '" ++ [ch] ++ hcs ++ "'") | |
else if hcs == "." then | |
let nnum = snd (nextNumber $ tail chs) in | |
(FloatLit, "0." ++ nnum) | |
else (IntLit, "0") | |
| [ch] =~ "[1-9]" = (IntLit, ch:(nextMatch chs "[0-9]" "[0-9]+")) | |
| ch == '.' = | |
if tnum == IntLit then | |
(FloatLit, '0':ch:num) | |
else | |
error $ "Unexpected float litteral after '.': '" ++ num ++ "'" | |
where (tnum, num) = nextNumber chs | |
hex = nextMatch chs "[xX]" "[0-9a-fA-F]+" | |
hcs = [head chs] | |
nextNumber _ = (Invalid, "") | |
isOp :: String -> Bool | |
isOp o = o `elem` operators | |
nextOp :: String -> String | |
nextOp [ch] | |
| isOp [ch] = [ch] | |
| otherwise = "" | |
nextOp (ch:str) | |
| isOp (ch:la2) = ch:la2 | |
| isOp (ch:la1) = ch:la1 | |
| isOp [ch] = [ch] | |
| otherwise = "" | |
where la1 = [head str] | |
la2 = if length str > 1 then drop 2 str else la1 | |
nextOp _ = "" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment