raek/Regex.hs

## Regex.hs
module Lexis.Regex  where

import Data.Char (ord)
import Data.Foldable (asum)
import Data.SciRatio.Read (readHexP, readIntegerP, readOctP, readSciRationalP)
import Data.Traversable (sequenceA)
import Data.Tuple (swap)
import Text.ParserCombinators.ReadP (ReadP, readP_to_S)
import Text.Regex.Applicative

import Lexis.Tables (keywords, simpleEscapes, symbols)
import Lexis.Types

type Lexer = RE Char

orDefault :: RE s a -> a -> RE s a
r `orDefault` x = r <|> pure x

maybeMissing :: RE s [a] -> RE s [a]
maybeMissing r = r `orDefault` []

(<++>) :: RE s [a] -> RE s [a] -> RE s [a]
(<++>) = liftA2 (++)

(<&>) :: RE s a -> RE s b -> RE s (a, b)
(<&>) = liftA2 (,)

anyOf :: Eq a => [a] -> RE a a
anyOf xs = psym (`elem` xs)

anyBut :: Eq a => [a] -> RE a a
anyBut xs = psym (`notElem` xs)

singleton :: RE s a -> RE s [a]
singleton = fmap (:[])

readp :: ReadP a -> String -> a
readp r s = case readP_to_S r s of
              [(x, _)] -> x
              _        -> error "Lexer.readp"

ch :: Char -> BigChar
ch = fromIntegral . ord

value :: Lexer Value
value = Whitespace                <$> whitespace
    <|> Keyword                   <$> keyword
    <|> Identifier                <$> identifier
    <|> uncurry FloatingConstant  <$> floatingConstant
    <|> uncurry IntegerConstant   <$> integerConstant
    <|> uncurry CharacterConstant <$> characterConstant
    <|> uncurry StringLiteral     <$> stringLiteral
    <|> Symbol                    <$> symbol

whitespace :: Lexer Whitespace
whitespace = some whitespaceCharacter

whitespaceCharacter :: Lexer Char
whitespaceCharacter = anyOf [' ', '\t', '\n', '\v', '\f', '\r']
-- According to the standard, translation phase 1 actually handles the
-- carriage return.

keyword :: Lexer Keyword
keyword = asum (map string keywords)

identifier :: Lexer Identifier
identifier = (:) <$> nondigit <*> many (nondigit <|> digit)

nondigit :: Lexer Char
nondigit = anyOf $ ['_'] ++ ['a'..'z'] ++ ['A'..'Z']

digit :: Lexer Char
digit = anyOf ['0'..'9']

floatingConstant :: Lexer (BigFloat, FloatSuffix)
floatingConstant = convert <$> (fractionalConstant <++> maybeMissing exponentPart)
                           <*> floatingSuffix `orDefault` Float0
               <|> convert <$> (digitSequence <++> exponentPart)
                           <*> floatingSuffix `orDefault` Float0
  where
    convert :: String -> FloatSuffix -> (BigFloat, FloatSuffix)
    convert s t = (readp readSciRationalP s, t)

fractionalConstant :: Lexer String
fractionalConstant = maybeMissing digitSequence
                <++> string "."
                <++> digitSequence

exponentPart :: Lexer String
exponentPart = singleton (anyOf ['e', 'E'])
          <++> maybeMissing sign
          <++> digitSequence

sign :: Lexer String
sign = singleton (anyOf ['+', '-'])

digitSequence :: Lexer String
digitSequence = some digit

floatingSuffix :: Lexer FloatSuffix
floatingSuffix = FloatF <$ anyOf ['f', 'F']
             <|> FloatL <$ anyOf ['l', 'L']

integerConstant :: Lexer (BigInt, IntSuffix)
integerConstant = (decimalConstant
               <|> octalConstant
               <|> hexadecimalConstant)
              <&> (integerSuffix `orDefault` Int0)

decimalConstant :: Lexer BigInt
decimalConstant = readp readIntegerP <$> singleton nonzeroDigit <++> many digit

nonzeroDigit :: Lexer Char
nonzeroDigit = anyOf ['1'..'9']

octalConstant :: Lexer BigInt
octalConstant = readp readOctP <$> singleton (sym '0') <++> many octalDigit

octalDigit :: Lexer Char
octalDigit = anyOf ['0'..'7']

hexadecimalConstant :: Lexer BigInt
hexadecimalConstant = (string "0x" <|> string "0X") *> (readp readHexP <$> many hexadecimalDigit)

hexadecimalDigit :: Lexer Char
hexadecimalDigit = anyOf $ ['0'..'9'] ++ ['a'..'f'] ++ ['A'..'F']

integerSuffix :: Lexer IntSuffix
integerSuffix = convert      <$> uSuffix <*> lSuffix `orDefault` False
            <|> flip convert <$> lSuffix <*> uSuffix `orDefault` False
  where
    uSuffix = True <$ anyOf ['u', 'U']
    lSuffix = True <$ anyOf ['l', 'L']
    --      u    l
    convert False False = Int0
    convert True  False = IntU
    convert False True  = IntL
    convert True  True  = IntUL

characterConstant :: Lexer ([BigChar], CharPrefix)
characterConstant = swap <$> prefix <&> char
    where
    prefix = (CharL <$ sym 'L') `orDefault` Char0
    char   = sym '\'' *> cCharSequence <* sym '\''

cCharSequence :: Lexer [BigChar]
cCharSequence = some cChar

cChar :: Lexer BigChar
cChar = (ch <$> anyBut ['\'', '\\', '\n']) <|> escapeSequence

escapeSequence :: Lexer BigChar
escapeSequence = sym '\\' *> (simpleEscapeSequence
                          <|> octalEscapeSequence
                          <|> hexadecimalEscapeSequence)

simpleEscapeSequence :: Lexer BigChar
simpleEscapeSequence = asum (map makeEscape simpleEscapes)
  where makeEscape (char, val) = ch val <$ sym char

octalEscapeSequence :: Lexer BigChar
octalEscapeSequence = readp readOctP <$> (three <|> two <|> one)
  where
    three = sequenceA [octalDigit, octalDigit, octalDigit]
    two   = sequenceA [octalDigit, octalDigit]
    one   = sequenceA [octalDigit]

hexadecimalEscapeSequence :: Lexer BigChar
hexadecimalEscapeSequence = sym 'x' *> (readp readHexP <$> some hexadecimalDigit)

stringLiteral :: Lexer ([BigChar], CharPrefix)
stringLiteral = swap <$> prefix <&> str
  where
    prefix = (CharL <$ sym 'L') `orDefault` Char0
    str    = sym '"' *> sCharSequence <* sym '"'

sCharSequence :: Lexer [BigChar]
sCharSequence = many sChar

sChar :: Lexer BigChar
sChar = (ch <$> anyBut ['"', '\\', '\n']) <|> escapeSequence

symbol :: Lexer Symbol
symbol = asum (map string symbols)

## Types.hs
module Lexis.Types where

import Data.SciRatio (SciRational)

data Token = Token
    { tokenString :: String
    , tokenRange  :: Range
    , tokenValue  :: Value
    }

instance Show Token where
    show = tokenString

data Range = Range
    { rangeFile  :: FilePath
    , rangeStart :: Location
    , rangeEnd   :: Location
    }

instance Show Range where
    show (Range file (Location l c) _) =
        file ++ ":" ++ show l ++ ":" ++  show c

data Location = Location
    { locationLine   :: Int
    , locationColumn :: Int
    }

data Value = Whitespace        Whitespace
           | Keyword           Keyword
           | Identifier        Identifier
           | FloatingConstant  BigFloat   FloatSuffix
           | IntegerConstant   BigInt     IntSuffix
           | CharacterConstant [BigChar]  CharPrefix
           | StringLiteral     [BigChar]  CharPrefix
           | Symbol            Symbol
           deriving Eq

type Whitespace = String
type Keyword = String
type Identifier = String
type BigFloat = SciRational
type BigInt = Integer
type BigChar = Integer
type Symbol = String

data FloatSuffix = Float0 | FloatF | FloatL deriving (Eq)
data IntSuffix = Int0 | IntL | IntU | IntUL deriving (Eq)
data CharPrefix = Char0 | CharL deriving (Eq)
	module Lexis.Regex where

	import Data.Char (ord)
	import Data.Foldable (asum)
	import Data.SciRatio.Read (readHexP, readIntegerP, readOctP, readSciRationalP)
	import Data.Traversable (sequenceA)
	import Data.Tuple (swap)
	import Text.ParserCombinators.ReadP (ReadP, readP_to_S)
	import Text.Regex.Applicative

	import Lexis.Tables (keywords, simpleEscapes, symbols)
	import Lexis.Types

	type Lexer = RE Char

	orDefault :: RE s a -> a -> RE s a
	r `orDefault` x = r <\|> pure x

	maybeMissing :: RE s [a] -> RE s [a]
	maybeMissing r = r `orDefault` []

	(<++>) :: RE s [a] -> RE s [a] -> RE s [a]
	(<++>) = liftA2 (++)

	(<&>) :: RE s a -> RE s b -> RE s (a, b)
	(<&>) = liftA2 (,)

	anyOf :: Eq a => [a] -> RE a a
	anyOf xs = psym (`elem` xs)

	anyBut :: Eq a => [a] -> RE a a
	anyBut xs = psym (`notElem` xs)

	singleton :: RE s a -> RE s [a]
	singleton = fmap (:[])

	readp :: ReadP a -> String -> a
	readp r s = case readP_to_S r s of
	[(x, _)] -> x
	_ -> error "Lexer.readp"

	ch :: Char -> BigChar
	ch = fromIntegral . ord

	value :: Lexer Value
	value = Whitespace <$> whitespace
	<\|> Keyword <$> keyword
	<\|> Identifier <$> identifier
	<\|> uncurry FloatingConstant <$> floatingConstant
	<\|> uncurry IntegerConstant <$> integerConstant
	<\|> uncurry CharacterConstant <$> characterConstant
	<\|> uncurry StringLiteral <$> stringLiteral
	<\|> Symbol <$> symbol

	whitespace :: Lexer Whitespace
	whitespace = some whitespaceCharacter

	whitespaceCharacter :: Lexer Char
	whitespaceCharacter = anyOf [' ', '\t', '\n', '\v', '\f', '\r']
	-- According to the standard, translation phase 1 actually handles the
	-- carriage return.

	keyword :: Lexer Keyword
	keyword = asum (map string keywords)

	identifier :: Lexer Identifier
	identifier = (:) <$> nondigit <*> many (nondigit <\|> digit)

	nondigit :: Lexer Char
	nondigit = anyOf $ ['_'] ++ ['a'..'z'] ++ ['A'..'Z']

	digit :: Lexer Char
	digit = anyOf ['0'..'9']

	floatingConstant :: Lexer (BigFloat, FloatSuffix)
	floatingConstant = convert <$> (fractionalConstant <++> maybeMissing exponentPart)
	<*> floatingSuffix `orDefault` Float0
	<\|> convert <$> (digitSequence <++> exponentPart)
	<*> floatingSuffix `orDefault` Float0
	where
	convert :: String -> FloatSuffix -> (BigFloat, FloatSuffix)
	convert s t = (readp readSciRationalP s, t)

	fractionalConstant :: Lexer String
	fractionalConstant = maybeMissing digitSequence
	<++> string "."
	<++> digitSequence

	exponentPart :: Lexer String
	exponentPart = singleton (anyOf ['e', 'E'])
	<++> maybeMissing sign
	<++> digitSequence

	sign :: Lexer String
	sign = singleton (anyOf ['+', '-'])

	digitSequence :: Lexer String
	digitSequence = some digit

	floatingSuffix :: Lexer FloatSuffix
	floatingSuffix = FloatF <$ anyOf ['f', 'F']
	<\|> FloatL <$ anyOf ['l', 'L']

	integerConstant :: Lexer (BigInt, IntSuffix)
	integerConstant = (decimalConstant
	<\|> octalConstant
	<\|> hexadecimalConstant)
	<&> (integerSuffix `orDefault` Int0)

	decimalConstant :: Lexer BigInt
	decimalConstant = readp readIntegerP <$> singleton nonzeroDigit <++> many digit

	nonzeroDigit :: Lexer Char
	nonzeroDigit = anyOf ['1'..'9']

	octalConstant :: Lexer BigInt
	octalConstant = readp readOctP <$> singleton (sym '0') <++> many octalDigit

	octalDigit :: Lexer Char
	octalDigit = anyOf ['0'..'7']

	hexadecimalConstant :: Lexer BigInt
	hexadecimalConstant = (string "0x" <\|> string "0X") *> (readp readHexP <$> many hexadecimalDigit)

	hexadecimalDigit :: Lexer Char
	hexadecimalDigit = anyOf $ ['0'..'9'] ++ ['a'..'f'] ++ ['A'..'F']

	integerSuffix :: Lexer IntSuffix
	integerSuffix = convert <$> uSuffix <*> lSuffix `orDefault` False
	<\|> flip convert <$> lSuffix <*> uSuffix `orDefault` False
	where
	uSuffix = True <$ anyOf ['u', 'U']
	lSuffix = True <$ anyOf ['l', 'L']
	-- u l
	convert False False = Int0
	convert True False = IntU
	convert False True = IntL
	convert True True = IntUL

	characterConstant :: Lexer ([BigChar], CharPrefix)
	characterConstant = swap <$> prefix <&> char
	where
	prefix = (CharL <$ sym 'L') `orDefault` Char0
	char = sym '\'' > cCharSequence < sym '\''

	cCharSequence :: Lexer [BigChar]
	cCharSequence = some cChar

	cChar :: Lexer BigChar
	cChar = (ch <$> anyBut ['\'', '\\', '\n']) <\|> escapeSequence

	escapeSequence :: Lexer BigChar
	escapeSequence = sym '\\' *> (simpleEscapeSequence
	<\|> octalEscapeSequence
	<\|> hexadecimalEscapeSequence)

	simpleEscapeSequence :: Lexer BigChar
	simpleEscapeSequence = asum (map makeEscape simpleEscapes)
	where makeEscape (char, val) = ch val <$ sym char

	octalEscapeSequence :: Lexer BigChar
	octalEscapeSequence = readp readOctP <$> (three <\|> two <\|> one)
	where
	three = sequenceA [octalDigit, octalDigit, octalDigit]
	two = sequenceA [octalDigit, octalDigit]
	one = sequenceA [octalDigit]

	hexadecimalEscapeSequence :: Lexer BigChar
	hexadecimalEscapeSequence = sym 'x' *> (readp readHexP <$> some hexadecimalDigit)

	stringLiteral :: Lexer ([BigChar], CharPrefix)
	stringLiteral = swap <$> prefix <&> str
	where
	prefix = (CharL <$ sym 'L') `orDefault` Char0
	str = sym '"' > sCharSequence < sym '"'

	sCharSequence :: Lexer [BigChar]
	sCharSequence = many sChar

	sChar :: Lexer BigChar
	sChar = (ch <$> anyBut ['"', '\\', '\n']) <\|> escapeSequence

	symbol :: Lexer Symbol
	symbol = asum (map string symbols)
	module Lexis.Types where

	import Data.SciRatio (SciRational)

	data Token = Token
	{ tokenString :: String
	, tokenRange :: Range
	, tokenValue :: Value
	}

	instance Show Token where
	show = tokenString

	data Range = Range
	{ rangeFile :: FilePath
	, rangeStart :: Location
	, rangeEnd :: Location
	}

	instance Show Range where
	show (Range file (Location l c) _) =
	file ++ ":" ++ show l ++ ":" ++ show c

	data Location = Location
	{ locationLine :: Int
	, locationColumn :: Int
	}

	data Value = Whitespace Whitespace
	\| Keyword Keyword
	\| Identifier Identifier
	\| FloatingConstant BigFloat FloatSuffix
	\| IntegerConstant BigInt IntSuffix
	\| CharacterConstant [BigChar] CharPrefix
	\| StringLiteral [BigChar] CharPrefix
	\| Symbol Symbol
	deriving Eq

	type Whitespace = String
	type Keyword = String
	type Identifier = String
	type BigFloat = SciRational
	type BigInt = Integer
	type BigChar = Integer
	type Symbol = String

	data FloatSuffix = Float0 \| FloatF \| FloatL deriving (Eq)
	data IntSuffix = Int0 \| IntL \| IntU \| IntUL deriving (Eq)
	data CharPrefix = Char0 \| CharL deriving (Eq)