Skip to content

Instantly share code, notes, and snippets.

@raek
Last active August 29, 2015 14:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save raek/6bfa9b7c9b606f2ead56 to your computer and use it in GitHub Desktop.
Save raek/6bfa9b7c9b606f2ead56 to your computer and use it in GitHub Desktop.
module Lexis.Regex where
import Data.Char (ord)
import Data.Foldable (asum)
import Data.SciRatio.Read (readHexP, readIntegerP, readOctP, readSciRationalP)
import Data.Traversable (sequenceA)
import Data.Tuple (swap)
import Text.ParserCombinators.ReadP (ReadP, readP_to_S)
import Text.Regex.Applicative
import Lexis.Tables (keywords, simpleEscapes, symbols)
import Lexis.Types
type Lexer = RE Char
orDefault :: RE s a -> a -> RE s a
r `orDefault` x = r <|> pure x
maybeMissing :: RE s [a] -> RE s [a]
maybeMissing r = r `orDefault` []
(<++>) :: RE s [a] -> RE s [a] -> RE s [a]
(<++>) = liftA2 (++)
(<&>) :: RE s a -> RE s b -> RE s (a, b)
(<&>) = liftA2 (,)
anyOf :: Eq a => [a] -> RE a a
anyOf xs = psym (`elem` xs)
anyBut :: Eq a => [a] -> RE a a
anyBut xs = psym (`notElem` xs)
singleton :: RE s a -> RE s [a]
singleton = fmap (:[])
readp :: ReadP a -> String -> a
readp r s = case readP_to_S r s of
[(x, _)] -> x
_ -> error "Lexer.readp"
ch :: Char -> BigChar
ch = fromIntegral . ord
value :: Lexer Value
value = Whitespace <$> whitespace
<|> Keyword <$> keyword
<|> Identifier <$> identifier
<|> uncurry FloatingConstant <$> floatingConstant
<|> uncurry IntegerConstant <$> integerConstant
<|> uncurry CharacterConstant <$> characterConstant
<|> uncurry StringLiteral <$> stringLiteral
<|> Symbol <$> symbol
whitespace :: Lexer Whitespace
whitespace = some whitespaceCharacter
whitespaceCharacter :: Lexer Char
whitespaceCharacter = anyOf [' ', '\t', '\n', '\v', '\f', '\r']
-- According to the standard, translation phase 1 actually handles the
-- carriage return.
keyword :: Lexer Keyword
keyword = asum (map string keywords)
identifier :: Lexer Identifier
identifier = (:) <$> nondigit <*> many (nondigit <|> digit)
nondigit :: Lexer Char
nondigit = anyOf $ ['_'] ++ ['a'..'z'] ++ ['A'..'Z']
digit :: Lexer Char
digit = anyOf ['0'..'9']
floatingConstant :: Lexer (BigFloat, FloatSuffix)
floatingConstant = convert <$> (fractionalConstant <++> maybeMissing exponentPart)
<*> floatingSuffix `orDefault` Float0
<|> convert <$> (digitSequence <++> exponentPart)
<*> floatingSuffix `orDefault` Float0
where
convert :: String -> FloatSuffix -> (BigFloat, FloatSuffix)
convert s t = (readp readSciRationalP s, t)
fractionalConstant :: Lexer String
fractionalConstant = maybeMissing digitSequence
<++> string "."
<++> digitSequence
exponentPart :: Lexer String
exponentPart = singleton (anyOf ['e', 'E'])
<++> maybeMissing sign
<++> digitSequence
sign :: Lexer String
sign = singleton (anyOf ['+', '-'])
digitSequence :: Lexer String
digitSequence = some digit
floatingSuffix :: Lexer FloatSuffix
floatingSuffix = FloatF <$ anyOf ['f', 'F']
<|> FloatL <$ anyOf ['l', 'L']
integerConstant :: Lexer (BigInt, IntSuffix)
integerConstant = (decimalConstant
<|> octalConstant
<|> hexadecimalConstant)
<&> (integerSuffix `orDefault` Int0)
decimalConstant :: Lexer BigInt
decimalConstant = readp readIntegerP <$> singleton nonzeroDigit <++> many digit
nonzeroDigit :: Lexer Char
nonzeroDigit = anyOf ['1'..'9']
octalConstant :: Lexer BigInt
octalConstant = readp readOctP <$> singleton (sym '0') <++> many octalDigit
octalDigit :: Lexer Char
octalDigit = anyOf ['0'..'7']
hexadecimalConstant :: Lexer BigInt
hexadecimalConstant = (string "0x" <|> string "0X") *> (readp readHexP <$> many hexadecimalDigit)
hexadecimalDigit :: Lexer Char
hexadecimalDigit = anyOf $ ['0'..'9'] ++ ['a'..'f'] ++ ['A'..'F']
integerSuffix :: Lexer IntSuffix
integerSuffix = convert <$> uSuffix <*> lSuffix `orDefault` False
<|> flip convert <$> lSuffix <*> uSuffix `orDefault` False
where
uSuffix = True <$ anyOf ['u', 'U']
lSuffix = True <$ anyOf ['l', 'L']
-- u l
convert False False = Int0
convert True False = IntU
convert False True = IntL
convert True True = IntUL
characterConstant :: Lexer ([BigChar], CharPrefix)
characterConstant = swap <$> prefix <&> char
where
prefix = (CharL <$ sym 'L') `orDefault` Char0
char = sym '\'' *> cCharSequence <* sym '\''
cCharSequence :: Lexer [BigChar]
cCharSequence = some cChar
cChar :: Lexer BigChar
cChar = (ch <$> anyBut ['\'', '\\', '\n']) <|> escapeSequence
escapeSequence :: Lexer BigChar
escapeSequence = sym '\\' *> (simpleEscapeSequence
<|> octalEscapeSequence
<|> hexadecimalEscapeSequence)
simpleEscapeSequence :: Lexer BigChar
simpleEscapeSequence = asum (map makeEscape simpleEscapes)
where makeEscape (char, val) = ch val <$ sym char
octalEscapeSequence :: Lexer BigChar
octalEscapeSequence = readp readOctP <$> (three <|> two <|> one)
where
three = sequenceA [octalDigit, octalDigit, octalDigit]
two = sequenceA [octalDigit, octalDigit]
one = sequenceA [octalDigit]
hexadecimalEscapeSequence :: Lexer BigChar
hexadecimalEscapeSequence = sym 'x' *> (readp readHexP <$> some hexadecimalDigit)
stringLiteral :: Lexer ([BigChar], CharPrefix)
stringLiteral = swap <$> prefix <&> str
where
prefix = (CharL <$ sym 'L') `orDefault` Char0
str = sym '"' *> sCharSequence <* sym '"'
sCharSequence :: Lexer [BigChar]
sCharSequence = many sChar
sChar :: Lexer BigChar
sChar = (ch <$> anyBut ['"', '\\', '\n']) <|> escapeSequence
symbol :: Lexer Symbol
symbol = asum (map string symbols)
module Lexis.Types where
import Data.SciRatio (SciRational)
data Token = Token
{ tokenString :: String
, tokenRange :: Range
, tokenValue :: Value
}
instance Show Token where
show = tokenString
data Range = Range
{ rangeFile :: FilePath
, rangeStart :: Location
, rangeEnd :: Location
}
instance Show Range where
show (Range file (Location l c) _) =
file ++ ":" ++ show l ++ ":" ++ show c
data Location = Location
{ locationLine :: Int
, locationColumn :: Int
}
data Value = Whitespace Whitespace
| Keyword Keyword
| Identifier Identifier
| FloatingConstant BigFloat FloatSuffix
| IntegerConstant BigInt IntSuffix
| CharacterConstant [BigChar] CharPrefix
| StringLiteral [BigChar] CharPrefix
| Symbol Symbol
deriving Eq
type Whitespace = String
type Keyword = String
type Identifier = String
type BigFloat = SciRational
type BigInt = Integer
type BigChar = Integer
type Symbol = String
data FloatSuffix = Float0 | FloatF | FloatL deriving (Eq)
data IntSuffix = Int0 | IntL | IntU | IntUL deriving (Eq)
data CharPrefix = Char0 | CharL deriving (Eq)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment