Last active
August 29, 2015 14:17
-
-
Save raek/6bfa9b7c9b606f2ead56 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Lexis.Regex where | |
import Data.Char (ord) | |
import Data.Foldable (asum) | |
import Data.SciRatio.Read (readHexP, readIntegerP, readOctP, readSciRationalP) | |
import Data.Traversable (sequenceA) | |
import Data.Tuple (swap) | |
import Text.ParserCombinators.ReadP (ReadP, readP_to_S) | |
import Text.Regex.Applicative | |
import Lexis.Tables (keywords, simpleEscapes, symbols) | |
import Lexis.Types | |
type Lexer = RE Char | |
orDefault :: RE s a -> a -> RE s a | |
r `orDefault` x = r <|> pure x | |
maybeMissing :: RE s [a] -> RE s [a] | |
maybeMissing r = r `orDefault` [] | |
(<++>) :: RE s [a] -> RE s [a] -> RE s [a] | |
(<++>) = liftA2 (++) | |
(<&>) :: RE s a -> RE s b -> RE s (a, b) | |
(<&>) = liftA2 (,) | |
anyOf :: Eq a => [a] -> RE a a | |
anyOf xs = psym (`elem` xs) | |
anyBut :: Eq a => [a] -> RE a a | |
anyBut xs = psym (`notElem` xs) | |
singleton :: RE s a -> RE s [a] | |
singleton = fmap (:[]) | |
readp :: ReadP a -> String -> a | |
readp r s = case readP_to_S r s of | |
[(x, _)] -> x | |
_ -> error "Lexer.readp" | |
ch :: Char -> BigChar | |
ch = fromIntegral . ord | |
value :: Lexer Value | |
value = Whitespace <$> whitespace | |
<|> Keyword <$> keyword | |
<|> Identifier <$> identifier | |
<|> uncurry FloatingConstant <$> floatingConstant | |
<|> uncurry IntegerConstant <$> integerConstant | |
<|> uncurry CharacterConstant <$> characterConstant | |
<|> uncurry StringLiteral <$> stringLiteral | |
<|> Symbol <$> symbol | |
whitespace :: Lexer Whitespace | |
whitespace = some whitespaceCharacter | |
whitespaceCharacter :: Lexer Char | |
whitespaceCharacter = anyOf [' ', '\t', '\n', '\v', '\f', '\r'] | |
-- According to the standard, translation phase 1 actually handles the | |
-- carriage return. | |
keyword :: Lexer Keyword | |
keyword = asum (map string keywords) | |
identifier :: Lexer Identifier | |
identifier = (:) <$> nondigit <*> many (nondigit <|> digit) | |
nondigit :: Lexer Char | |
nondigit = anyOf $ ['_'] ++ ['a'..'z'] ++ ['A'..'Z'] | |
digit :: Lexer Char | |
digit = anyOf ['0'..'9'] | |
floatingConstant :: Lexer (BigFloat, FloatSuffix) | |
floatingConstant = convert <$> (fractionalConstant <++> maybeMissing exponentPart) | |
<*> floatingSuffix `orDefault` Float0 | |
<|> convert <$> (digitSequence <++> exponentPart) | |
<*> floatingSuffix `orDefault` Float0 | |
where | |
convert :: String -> FloatSuffix -> (BigFloat, FloatSuffix) | |
convert s t = (readp readSciRationalP s, t) | |
fractionalConstant :: Lexer String | |
fractionalConstant = maybeMissing digitSequence | |
<++> string "." | |
<++> digitSequence | |
exponentPart :: Lexer String | |
exponentPart = singleton (anyOf ['e', 'E']) | |
<++> maybeMissing sign | |
<++> digitSequence | |
sign :: Lexer String | |
sign = singleton (anyOf ['+', '-']) | |
digitSequence :: Lexer String | |
digitSequence = some digit | |
floatingSuffix :: Lexer FloatSuffix | |
floatingSuffix = FloatF <$ anyOf ['f', 'F'] | |
<|> FloatL <$ anyOf ['l', 'L'] | |
integerConstant :: Lexer (BigInt, IntSuffix) | |
integerConstant = (decimalConstant | |
<|> octalConstant | |
<|> hexadecimalConstant) | |
<&> (integerSuffix `orDefault` Int0) | |
decimalConstant :: Lexer BigInt | |
decimalConstant = readp readIntegerP <$> singleton nonzeroDigit <++> many digit | |
nonzeroDigit :: Lexer Char | |
nonzeroDigit = anyOf ['1'..'9'] | |
octalConstant :: Lexer BigInt | |
octalConstant = readp readOctP <$> singleton (sym '0') <++> many octalDigit | |
octalDigit :: Lexer Char | |
octalDigit = anyOf ['0'..'7'] | |
hexadecimalConstant :: Lexer BigInt | |
hexadecimalConstant = (string "0x" <|> string "0X") *> (readp readHexP <$> many hexadecimalDigit) | |
hexadecimalDigit :: Lexer Char | |
hexadecimalDigit = anyOf $ ['0'..'9'] ++ ['a'..'f'] ++ ['A'..'F'] | |
integerSuffix :: Lexer IntSuffix | |
integerSuffix = convert <$> uSuffix <*> lSuffix `orDefault` False | |
<|> flip convert <$> lSuffix <*> uSuffix `orDefault` False | |
where | |
uSuffix = True <$ anyOf ['u', 'U'] | |
lSuffix = True <$ anyOf ['l', 'L'] | |
-- u l | |
convert False False = Int0 | |
convert True False = IntU | |
convert False True = IntL | |
convert True True = IntUL | |
characterConstant :: Lexer ([BigChar], CharPrefix) | |
characterConstant = swap <$> prefix <&> char | |
where | |
prefix = (CharL <$ sym 'L') `orDefault` Char0 | |
char = sym '\'' *> cCharSequence <* sym '\'' | |
cCharSequence :: Lexer [BigChar] | |
cCharSequence = some cChar | |
cChar :: Lexer BigChar | |
cChar = (ch <$> anyBut ['\'', '\\', '\n']) <|> escapeSequence | |
escapeSequence :: Lexer BigChar | |
escapeSequence = sym '\\' *> (simpleEscapeSequence | |
<|> octalEscapeSequence | |
<|> hexadecimalEscapeSequence) | |
simpleEscapeSequence :: Lexer BigChar | |
simpleEscapeSequence = asum (map makeEscape simpleEscapes) | |
where makeEscape (char, val) = ch val <$ sym char | |
octalEscapeSequence :: Lexer BigChar | |
octalEscapeSequence = readp readOctP <$> (three <|> two <|> one) | |
where | |
three = sequenceA [octalDigit, octalDigit, octalDigit] | |
two = sequenceA [octalDigit, octalDigit] | |
one = sequenceA [octalDigit] | |
hexadecimalEscapeSequence :: Lexer BigChar | |
hexadecimalEscapeSequence = sym 'x' *> (readp readHexP <$> some hexadecimalDigit) | |
stringLiteral :: Lexer ([BigChar], CharPrefix) | |
stringLiteral = swap <$> prefix <&> str | |
where | |
prefix = (CharL <$ sym 'L') `orDefault` Char0 | |
str = sym '"' *> sCharSequence <* sym '"' | |
sCharSequence :: Lexer [BigChar] | |
sCharSequence = many sChar | |
sChar :: Lexer BigChar | |
sChar = (ch <$> anyBut ['"', '\\', '\n']) <|> escapeSequence | |
symbol :: Lexer Symbol | |
symbol = asum (map string symbols) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Lexis.Types where | |
import Data.SciRatio (SciRational) | |
data Token = Token | |
{ tokenString :: String | |
, tokenRange :: Range | |
, tokenValue :: Value | |
} | |
instance Show Token where | |
show = tokenString | |
data Range = Range | |
{ rangeFile :: FilePath | |
, rangeStart :: Location | |
, rangeEnd :: Location | |
} | |
instance Show Range where | |
show (Range file (Location l c) _) = | |
file ++ ":" ++ show l ++ ":" ++ show c | |
data Location = Location | |
{ locationLine :: Int | |
, locationColumn :: Int | |
} | |
data Value = Whitespace Whitespace | |
| Keyword Keyword | |
| Identifier Identifier | |
| FloatingConstant BigFloat FloatSuffix | |
| IntegerConstant BigInt IntSuffix | |
| CharacterConstant [BigChar] CharPrefix | |
| StringLiteral [BigChar] CharPrefix | |
| Symbol Symbol | |
deriving Eq | |
type Whitespace = String | |
type Keyword = String | |
type Identifier = String | |
type BigFloat = SciRational | |
type BigInt = Integer | |
type BigChar = Integer | |
type Symbol = String | |
data FloatSuffix = Float0 | FloatF | FloatL deriving (Eq) | |
data IntSuffix = Int0 | IntL | IntU | IntUL deriving (Eq) | |
data CharPrefix = Char0 | CharL deriving (Eq) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment