Spriithy/Lex.hs

## Lex.hs
module Lex (
  Token(..),
  tokKind,
  tokText,
  tokPos,
  Kind(..),
  doLex,
) where

import           Data.Char
import           Text.Printf
import           Text.Regex.TDFA

data Kind
  = Invalid
  | Skip
  | EOL
  | EOF
  -- Actual token kinds
  | Name
  | Anonymous
  | IntLit
  | FloatLit
  | Keyword
  | Delimiter
  | Operator
  deriving (Show, Eq)

data Token = Token { line, col :: Int
                   , kind      :: Kind
                   , text      :: String
                   }

instance Show Token where
  show tok = printf "l(%02d) c(%02d) %s %s" (line tok) (col tok) (show $ kind tok) (show $ text tok)

tokKind :: Token -> Kind
tokKind t = kind t

tokText :: Token -> String
tokText t = text t

tokPos :: Token -> (Int, Int)
tokPos t = (line t, col t)

keywords :: [String]
keywords
  = [

  ]

operators :: [String]
operators = [
  "+", "++", "+=",
  "-", "--", "-=", "->",
  "*",       "*=",
  "/",       "/=",
  "%",       "%=",
  ">", ">>", ">=", ">>=",
  "<", "<<", "<=", "<<=", "<-",
  "!", "!!", "!=",
  "|", "||", "|=",
  "&", "&&", "&=",
  "~",       "~=",
  "^",       "^=",
  "@",        "="
  ]

delimiters :: [String]
delimiters = [
    ".", ",", ";", ":", "::", "...", "..",
    "(", ")", "[", "]", "{",  "}"
  ]

wrap :: Kind -> String -> Int -> Int -> Token
wrap k t l c = Token { line=l, col=c, kind=k, text=t }

kindOf :: String -> Kind
kindOf [ch]
  | isDigit ch    = IntLit
  | isLetter ch   = Name
  | ch == '_'     = Anonymous
  | ch == '\n'    = EOL
  | isSpace ch    = Skip -- \n is a space, but caught before
  | isOp [ch]     = Operator
  | isDelim [ch]  = Delimiter
  | otherwise     = Invalid
  where isDelim str   = str `elem` delimiters
kindOf str
  | str `elem` keywords   = Keyword
  | str `elem` operators  = Operator
  | str `elem` delimiters = Delimiter
  | isFloatLit str        = FloatLit
  | isIntLit str          = IntLit
  | isIdent str           = Name
  | isOp str              = Operator
  | otherwise             = Invalid
  where isIntLit      = (\ s -> s =~ "(0[xX][0-9a-fA-F]+|[1-9]([0-9]+)?)")
        isFloatLit    = (\ s -> s =~ "([0-9]+\\.([0-9]+)?|([0-9]+)?\\.[0-9]+)")
        isIdent ""    = False
        isIdent s     = s == (nextMatch str "[_a-zA-Z]" "[_a-zA-Z]([a-zA-Z0-9_]+)?")

tokenize :: String -> Int -> Int -> [Token]
tokenize [] l c     = [wrap EOF "" l c]
tokenize (ch:[]) l c
  | kch == EOL      = [wrap EOF "" (l + 1) c]
  | kch == Skip     = [wrap EOF "" l (c + 1)]
  | kch == Invalid  = error $ "Unexpected character before end of file: '" ++ [ch] ++ "'"
  | otherwise       = [wrap kch [ch] l c, wrap EOF "" l (c + 1)]
    where kch = kindOf [ch]
tokenize str@(ch:chs) l c
  | kch == EOL       = tokenize chs (l + 1) 1
  | kch == Skip      = tokenize chs l (c + 1)
  | identS ch        = (wrap (kindOf ident) ident l c):(tokenize (drop (length ident - 1) chs) l (c + length ident))
  | kch == Operator  = (wrap Operator op l c):(tokenize (drop (length op) str) l (c + length op))
  | kch == IntLit    = (wrap tnum num l c):(tokenize (drop (length num) str) l (c + length num))
  | kch == Delimiter =
    if (ch == '.') && (isDigit $ head chs) then
      (wrap FloatLit num l c):(tokenize (drop (length num - 1) str) l (c + length num))
    else
      (wrap Delimiter [ch] l c):(tokenize chs l (c + 1))

  | otherwise        = error $ "Unexpected character: '" ++ [ch] ++ "' at line " ++ show l ++ ", column " ++ show c
    where kch    = kindOf [ch]
          identS = (\ k -> isLetter k || k == '_')
          identP = (\ k -> identS k || isDigit k)
          ident  = ch:(takeWhile identP chs)
          op     = nextOp str
          (tnum, num) = nextNumber str

doLex :: String -> [Token]
doLex s
  | length s > 0 = tokenize s 1 1
  | otherwise    = error "Nothing to lex"


-- Used to retrieve the longest matching string given 2 patterns:
-- One to identify the head, one for the tail of any matching string
--           Source     head      tail     Result
nextMatch :: String -> String -> String -> String
nextMatch [ch] hd _
  | [ch] =~ hd  = [ch]
  | otherwise   = ""
nextMatch (ch:str) hd tl
  | [ch] =~ hd  = ch:(nextMatch str tl tl)
  | otherwise   = ""
nextMatch _ _ _ = ""

nextNumber :: String -> (Kind, String)
nextNumber [ch] = (IntLit, nextMatch [ch] "[0-9]" "")
nextNumber (ch:chs)
  | ch == '0' =
    if length hex > 1 then
      (IntLit, ch:hex)
    else
      if hcs =~ "[xX]" then
        error ("Malformed numeric litteral '" ++ [ch] ++ hcs ++ "'")
      else if hcs == "." then
        let nnum = snd (nextNumber $ tail chs) in
        (FloatLit, "0." ++ nnum)
      else (IntLit, "0")
  | [ch] =~ "[1-9]" = (IntLit, ch:(nextMatch chs "[0-9]" "[0-9]+"))
  | ch == '.' =
    if tnum == IntLit then
      (FloatLit, '0':ch:num)
    else
      error $ "Unexpected float litteral after '.': '" ++ num ++ "'"
    where (tnum, num) = nextNumber chs
          hex = nextMatch chs "[xX]" "[0-9a-fA-F]+"
          hcs = [head chs]
nextNumber _ = (Invalid, "")

isOp :: String -> Bool
isOp o = o `elem` operators

nextOp :: String -> String
nextOp [ch]
  | isOp [ch]     = [ch]
  | otherwise     = ""
nextOp (ch:str)
  | isOp (ch:la2) = ch:la2
  | isOp (ch:la1) = ch:la1
  | isOp [ch]     = [ch]
  | otherwise     = ""
  where la1 = [head str]
        la2 = if length str > 1 then drop 2 str else la1
nextOp _ = ""
	module Lex (
	Token(..),
	tokKind,
	tokText,
	tokPos,
	Kind(..),
	doLex,
	) where

	import Data.Char
	import Text.Printf
	import Text.Regex.TDFA

	data Kind
	= Invalid
	\| Skip
	\| EOL
	\| EOF
	-- Actual token kinds
	\| Name
	\| Anonymous
	\| IntLit
	\| FloatLit
	\| Keyword
	\| Delimiter
	\| Operator
	deriving (Show, Eq)

	data Token = Token { line, col :: Int
	, kind :: Kind
	, text :: String
	}

	instance Show Token where
	show tok = printf "l(%02d) c(%02d) %s %s" (line tok) (col tok) (show $ kind tok) (show $ text tok)

	tokKind :: Token -> Kind
	tokKind t = kind t

	tokText :: Token -> String
	tokText t = text t

	tokPos :: Token -> (Int, Int)
	tokPos t = (line t, col t)

	keywords :: [String]
	keywords
	= [

	]

	operators :: [String]
	operators = [
	"+", "++", "+=",
	"-", "--", "-=", "->",
	"", "=",
	"/", "/=",
	"%", "%=",
	">", ">>", ">=", ">>=",
	"<", "<<", "<=", "<<=", "<-",
	"!", "!!", "!=",
	"\|", "\|\|", "\|=",
	"&", "&&", "&=",
	"~", "~=",
	"^", "^=",
	"@", "="
	]

	delimiters :: [String]
	delimiters = [
	".", ",", ";", ":", "::", "...", "..",
	"(", ")", "[", "]", "{", "}"
	]

	wrap :: Kind -> String -> Int -> Int -> Token
	wrap k t l c = Token { line=l, col=c, kind=k, text=t }

	kindOf :: String -> Kind
	kindOf [ch]
	\| isDigit ch = IntLit
	\| isLetter ch = Name
	\| ch == '_' = Anonymous
	\| ch == '\n' = EOL
	\| isSpace ch = Skip -- \n is a space, but caught before
	\| isOp [ch] = Operator
	\| isDelim [ch] = Delimiter
	\| otherwise = Invalid
	where isDelim str = str `elem` delimiters
	kindOf str
	\| str `elem` keywords = Keyword
	\| str `elem` operators = Operator
	\| str `elem` delimiters = Delimiter
	\| isFloatLit str = FloatLit
	\| isIntLit str = IntLit
	\| isIdent str = Name
	\| isOp str = Operator
	\| otherwise = Invalid
	where isIntLit = (\ s -> s =~ "(0[xX][0-9a-fA-F]+\|[1-9]([0-9]+)?)")
	isFloatLit = (\ s -> s =~ "([0-9]+\\.([0-9]+)?\|([0-9]+)?\\.[0-9]+)")
	isIdent "" = False
	isIdent s = s == (nextMatch str "[_a-zA-Z]" "[_a-zA-Z]([a-zA-Z0-9_]+)?")

	tokenize :: String -> Int -> Int -> [Token]
	tokenize [] l c = [wrap EOF "" l c]
	tokenize (ch:[]) l c
	\| kch == EOL = [wrap EOF "" (l + 1) c]
	\| kch == Skip = [wrap EOF "" l (c + 1)]
	\| kch == Invalid = error $ "Unexpected character before end of file: '" ++ [ch] ++ "'"
	\| otherwise = [wrap kch [ch] l c, wrap EOF "" l (c + 1)]
	where kch = kindOf [ch]
	tokenize str@(ch:chs) l c
	\| kch == EOL = tokenize chs (l + 1) 1
	\| kch == Skip = tokenize chs l (c + 1)
	\| identS ch = (wrap (kindOf ident) ident l c):(tokenize (drop (length ident - 1) chs) l (c + length ident))
	\| kch == Operator = (wrap Operator op l c):(tokenize (drop (length op) str) l (c + length op))
	\| kch == IntLit = (wrap tnum num l c):(tokenize (drop (length num) str) l (c + length num))
	\| kch == Delimiter =
	if (ch == '.') && (isDigit $ head chs) then
	(wrap FloatLit num l c):(tokenize (drop (length num - 1) str) l (c + length num))
	else
	(wrap Delimiter [ch] l c):(tokenize chs l (c + 1))

	\| otherwise = error $ "Unexpected character: '" ++ [ch] ++ "' at line " ++ show l ++ ", column " ++ show c
	where kch = kindOf [ch]
	identS = (\ k -> isLetter k \|\| k == '_')
	identP = (\ k -> identS k \|\| isDigit k)
	ident = ch:(takeWhile identP chs)
	op = nextOp str
	(tnum, num) = nextNumber str

	doLex :: String -> [Token]
	doLex s
	\| length s > 0 = tokenize s 1 1
	\| otherwise = error "Nothing to lex"


	-- Used to retrieve the longest matching string given 2 patterns:
	-- One to identify the head, one for the tail of any matching string
	-- Source head tail Result
	nextMatch :: String -> String -> String -> String
	nextMatch [ch] hd _
	\| [ch] =~ hd = [ch]
	\| otherwise = ""
	nextMatch (ch:str) hd tl
	\| [ch] =~ hd = ch:(nextMatch str tl tl)
	\| otherwise = ""
	nextMatch _ _ _ = ""

	nextNumber :: String -> (Kind, String)
	nextNumber [ch] = (IntLit, nextMatch [ch] "[0-9]" "")
	nextNumber (ch:chs)
	\| ch == '0' =
	if length hex > 1 then
	(IntLit, ch:hex)
	else
	if hcs =~ "[xX]" then
	error ("Malformed numeric litteral '" ++ [ch] ++ hcs ++ "'")
	else if hcs == "." then
	let nnum = snd (nextNumber $ tail chs) in
	(FloatLit, "0." ++ nnum)
	else (IntLit, "0")
	\| [ch] =~ "[1-9]" = (IntLit, ch:(nextMatch chs "[0-9]" "[0-9]+"))
	\| ch == '.' =
	if tnum == IntLit then
	(FloatLit, '0':ch:num)
	else
	error $ "Unexpected float litteral after '.': '" ++ num ++ "'"
	where (tnum, num) = nextNumber chs
	hex = nextMatch chs "[xX]" "[0-9a-fA-F]+"
	hcs = [head chs]
	nextNumber _ = (Invalid, "")

	isOp :: String -> Bool
	isOp o = o `elem` operators

	nextOp :: String -> String
	nextOp [ch]
	\| isOp [ch] = [ch]
	\| otherwise = ""
	nextOp (ch:str)
	\| isOp (ch:la2) = ch:la2
	\| isOp (ch:la1) = ch:la1
	\| isOp [ch] = [ch]
	\| otherwise = ""
	where la1 = [head str]
	la2 = if length str > 1 then drop 2 str else la1
	nextOp _ = ""