Skip to content

Instantly share code, notes, and snippets.

@Heimdell
Last active October 17, 2022 08:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Heimdell/a06df902a0e0028ea5a25dfbfdb95af7 to your computer and use it in GitHub Desktop.
Save Heimdell/a06df902a0e0028ea5a25dfbfdb95af7 to your computer and use it in GitHub Desktop.
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Redundant multi-way if" #-}
{-# LANGUAGE ApplicativeDo #-}
{-# LANGUAGE BlockArguments #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE ImportQualifiedPost #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE DeriveTraversable #-}
{- | Implementation of Thompson scanner (regexp).
Mostly taken verbatim from
https://www.cs.kent.ac.uk/people/staff/sjt/craft2e/regExp.pdf.
Higly unefficient in generation of DFA, slightly inefficient in matching
(we're calling `<=` on `Set Int` here multiple times per character).
-}
module Lexer where
import Control.Monad.Except (throwError)
import Control.Monad.State ( runState, evalState, get, put, State )
import Data.Map (Map)
import Data.Map qualified as Map
import Data.Maybe (listToMaybe)
import Data.Set (Set)
import Data.Set qualified as Set
import Data.String (IsString)
import GHC.Exts (IsString(fromString))
import Debug.Trace (traceShowM)
-- | Regular expression.
--
data Reg a
= Eps -- ^ Empty string
| Lit a -- ^ One char
| And (Reg a) (Reg a) -- ^ Sequence
| Or (Reg a) (Reg a) -- ^ Selection
| Star (Reg a) -- ^ Kleene star (0 or more repeats)
deriving stock (Eq, Ord, Functor, Foldable)
-- | Alphabet - set of all used characters ("literals").
--
literals :: Ord a => Reg a -> Set a
literals = foldMap Set.singleton
-- | ToString().
--
instance Show (Reg Char) where
show :: Reg Char -> String
show = \case
Eps -> ""
Lit c -> [c]
And reg reg' -> show reg <> show reg'
Or reg reg' -> "(" <> show reg <> "|" <> show reg' <> ")"
Star reg -> "(" <> show reg <> ")*"
-- | FromString().
--
instance IsString (Reg Char) where
fromString :: String -> Reg Char
fromString = foldr (And . Lit) Eps
-- | Indetermistic finite automaton.
--
data NFA c s = NFA
{ nfaStates :: Set s -- ^ All automaton states
, nfaMoves :: Set (Move c s) -- ^ All automaton moves
, nfaStart :: s -- ^ Starting (current) state
, nfaEnds :: Set s -- ^ All ending states
}
deriving stock (Eq, Ord, Show, Foldable)
-- | Automaton moves.
--
data Move c s
= Move s c s -- ^ Move on given literal (character)
| EMove s s -- ^ Non-labeled transition (thus nondeterministic)
deriving stock (Eq, Ord, Show, Functor, Foldable, Traversable)
-- | Closure of the first state for given alphabet.
--
trans :: (Ord s, Eq c) => NFA c s -> Set c -> Set s
trans nfa = foldr (oneTrans nfa) startset
where
startset = closure nfa (Set.singleton (nfaStart nfa))
-- | Closure on the state that happens if we accept this char in this state set.
--
oneTrans :: (Ord s, Eq c) => NFA c s -> c -> Set s -> Set s
oneTrans nfa c s = closure nfa (oneMove nfa c s)
-- | Closure.
--
-- NFA closure is a set of states, that is built from smaller /kernel/ set of
-- states. It is a set of states that is reachable with `EMove` transitions.
--
-- Mechanically, we contract all the states, `EMove`-connected to the /kernel/
-- into one set.
--
closure :: Ord s => NFA c s -> Set s -> Set s
closure NFA {nfaStart, nfaMoves, nfaStates, nfaEnds} =
limit \set -> Set.union set $ Set.fromList
[ s
| x <- Set.toList set -- for each kernel state,
, EMove y s <- Set.toList nfaMoves -- select `EMove`-s,
, y == x -- connected to that kernel state
]
-- | Continiously apply function to the argument until it stops changing it.
--
limit :: Eq a => (a -> a) -> a -> a
limit f a =
let b = f a
in
if a == b
then a
else limit f b
-- | Get a set of states that we reach from closure if we accept given literal.
--
oneMove :: (Ord s, Eq c) => NFA c s -> c -> Set s -> Set s
oneMove NFA {nfaStart, nfaMoves, nfaStates, nfaEnds} c x =
Set.fromList
[ s
| t <- Set.toList x -- each closure state
, Move z d s <- Set.toList nfaMoves -- find all moves
, z == t -- originating from that state
, d == c -- and accepting given symbol
]
-- | Convert regular expresion into `NFA`. We use `Int`-s as state names.
--
build :: Ord c => Reg c -> NFA c Int
build = \case
Eps -> NFA
{ nfaStates = Set.fromList [0, 1] -- begin, end
, nfaMoves = Set.singleton (EMove 0 1)
, nfaStart = 0
, nfaEnds = Set.singleton 1
}
Lit c -> NFA
{ nfaStates = Set.fromList [0, 1]
, nfaStart = 0
, nfaMoves = Set.singleton (Move 0 c 1) -- one move on given literal
, nfaEnds = Set.singleton 1
}
And reg reg' -> nfaAnd (build reg) (build reg') -- join graphs sequentally
Or reg reg' -> nfaOr (build reg) (build reg') -- join graphs alongside
Star reg -> nfaStar (build reg) -- make a loop
where
-- We add 2 states: starting and finishing, and connect both subgraphs to
-- them.
--
-- States are renamed such that 0 would not belong to any of them, and their
-- nodes do not intersect. The number line looks like that:
--
-- 0 [nfa1] [nfa2] (m1 + m2 + 1)
--
-- where m1, m2 are the sizes of nfa1 and nfa2.
--
nfaOr :: Ord c => NFA c Int -> NFA c Int -> NFA c Int
nfaOr nfa1 nfa2 = NFA
{ nfaStart = 0
, nfaStates = Set.unions [states1, states2, newStates]
, nfaMoves = Set.unions [moves1, moves2, newMoves]
, nfaEnds = Set.singleton (m1 + m2 + 1)
}
where
m1 = Set.size (nfaStates nfa1)
m2 = Set.size (nfaStates nfa2)
states1 = Set.map (+ 1) (nfaStates nfa1) -- shift right 1 point
states2 = Set.map (+ (m1 + 1)) (nfaStates nfa2) -- shr (1 + nfa1) pts
newStates = Set.fromList [0, m1 + m2 + 1]
moves1 = Set.map (fmap (+ 1)) (nfaMoves nfa1)
moves2 = Set.map (fmap (+ (m1 + 1))) (nfaMoves nfa2)
newMoves = Set.fromList
[
EMove 0 1 -- start -> nfa1
, EMove 0 (m1 + 1) -- start -> nfa2
, EMove m1 (m1 + m2 + 1) -- nfa1 -> finish
, EMove (m1 + m2) (m1 + m2 + 1) -- nfa2 -> finish
]
-- No states are added, we connect graphs in sequence.
--
-- States are renamed such that the starting node of @nfa2@ is the finishing
-- node of @nfa1@.
--
-- Number line:
--
-- [nfa1... joint]
-- [joint ...nfa2]
--
-- We also add no new moves, because graphs become correctly connected
-- naturally.
--
nfaAnd :: Ord c => NFA c Int -> NFA c Int -> NFA c Int
nfaAnd nfa1 nfa2 = NFA
{ nfaStart = 0
, nfaStates = states1 `Set.union` states2
, nfaMoves = moves1 `Set.union` moves2
, nfaEnds = Set.singleton (m1 + m2 - 2)
}
where
m1 = Set.size (nfaStates nfa1)
m2 = Set.size (nfaStates nfa2)
states1 = nfaStates nfa1
states2 = Set.map (+ (m1 - 1)) (nfaStates nfa2) -- shr (1 - nfa1) pts
-- ... so they can join
moves1 = nfaMoves nfa1
moves2 = Set.map (fmap (+ (m1 - 1))) (nfaMoves nfa2)
-- We add startin anf final point, and wire moves such that graph becomes
-- a loop around nfa1.
--
-- Number line
--
-- 0 [nfa1] (m1 + 1)
--
nfaStar :: Ord c => NFA c Int -> NFA c Int
nfaStar nfa1 = NFA
{ nfaStart = 0
, nfaStates = states1 `Set.union` newStates
, nfaMoves = moves1 `Set.union` newMoves
, nfaEnds = Set.singleton (m1 + 1)
}
where
m1 = Set.size (nfaStates nfa1)
newStates = Set.fromList [0, m1 + 1]
states1 = Set.map (+ 1) (nfaStates nfa1)
moves1 = Set.map (fmap (+ 1)) (nfaMoves nfa1)
newMoves = Set.fromList
[ EMove 0 1 -- start -> nfa1
, EMove 0 (m1 + 1) -- start -> finish (remove to get Kleene plus)
, EMove m1 1 -- nfa1 -> more nfa1
, EMove m1 (m1 + 1) -- nfa1 -> finish
]
-- | Remove (retract) all empty moves (EMoves).
--
-- Naturally, we use state sets instead of states, because all EMove-connected
-- states are indistinguishable, as nothing prevents you from using them
-- at any time.
--
-- So we pull all the states, connected to the kernels into one set, which
-- counts as state for DFA.
--
-- NFA (Set State) == DFA, basically.
--
-- We can view DFA states as EMove-connected groups of NFA states.
--
-- Is done by running `addStep` until no new info is gained.
--
determine
:: Ord c
=> NFA c Int -- ^ Automata
-> Set c -- ^ Alphabet
-> NFA c (Set Int)
determine nfa@NFA {nfaStart, nfaMoves, nfaStates, nfaEnds} alph =
limit (addStep nfa alph) start
where
start = NFA
{ nfaStart = startState
, nfaStates = Set.singleton startState
, nfaMoves = Set.empty
, nfaEnds = if
| nfaEnds `Set.disjoint` startState -> Set.empty
| otherwise -> Set.singleton startState
}
startState = closure nfa (Set.singleton nfaStart)
-- | Add all possible transitions of NFA into DFA.
--
addStep
:: forall c
. Ord c
=> NFA c Int -- ^ NFA
-> Set c -- ^ Alphabet
-> NFA c (Set Int) -- ^ DFA
-> NFA c (Set Int)
addStep nfa alph dfa = foldr add dfa (Set.toList (nfaStates dfa))
where
add :: Set Int -> NFA c (Set Int) -> NFA c (Set Int)
add s dfa = Set.foldr (addMove nfa s) dfa alph
-- | Add all NFA transitions on given literal (character) to the DFA.
--
addMove
:: Ord c
=> NFA c Int -- ^ NFA
-> Set Int -- ^ Current set of states == Current DFA state
-> c -- ^ Literal (character)
-> NFA c (Set Int) -- ^ DFA
-> NFA c (Set Int)
addMove
nfa@NFA { nfaEnds = term }
x
c
dfa@NFA { nfaStart, nfaMoves, nfaEnds, nfaStates}
=
NFA
{ nfaStart
, nfaMoves = moves'
, nfaStates = states'
, nfaEnds = ends'
}
where
states' = nfaStates `Set.union` Set.singleton new
moves' = nfaMoves `Set.union` Set.singleton (Move x c new)
ends' -- andd `new` to the end states, if intersects with NFA's end states
| term `Set.disjoint` new = nfaEnds
| otherwise = nfaEnds `Set.union` Set.singleton new
new = oneTrans nfa c x -- calculate state on `Move ... c ...` transition
-- | For some arcane reasons, my automatas are polluted by additional DFA state.
--
-- This DFA state is an empty set of NFA states and has all possible
-- connections with all other graph nodes.
--
-- For time being, instead of wrapping `closure`/`oneTrans` with `Maybe`
-- to cut off creation of null-state, I decided to filter it out afterwards.
--
-- TODO: prevent that from happening instead.
--
-- We remove null-set from states, and purge all moves from and to it.
--
filterNFA :: Ord s => NFA c (Set s) -> NFA c (Set s)
filterNFA NFA { nfaStart, nfaEnds, nfaMoves, nfaStates } =
NFA
{ nfaStart
, nfaEnds
, nfaMoves = Set.filter (not . any Set.null) nfaMoves
, nfaStates = Set.filter (not . Set.null) nfaStates
}
rename :: Ord c => NFA c (Set Int) -> NFA c Int
rename nfa = evalState (traverseNFA renumber nfa) (Map.empty, 0)
where
renumber :: Set Int -> State (Map (Set Int) Int, Int) Int
renumber set = do
(m, ptr) <- get
case Map.lookup set m of
Nothing -> do
put (Map.insert set ptr m, ptr + 1)
return ptr
Just any -> do
return any
traverseNFA :: (Applicative f, Ord t, Ord c) => (s -> f t) -> NFA c s -> f (NFA c t)
traverseNFA f NFA {nfaStart, nfaStates, nfaMoves, nfaEnds} = do
nfaStart <- f nfaStart
nfaEnds <- Set.fromList <$> traverse f (Set.toList nfaEnds)
nfaMoves <- Set.fromList <$> (traverse.traverse) f (Set.toList nfaMoves)
nfaStates <- Set.fromList <$> traverse f (Set.toList nfaStates)
return NFA {nfaStart, nfaStates, nfaMoves, nfaEnds}
-- | Run DFA on a string to check if string fits the rules.
--
match
:: (Ord s, Ord c) => NFA c s -- ^ DFA
-> [c] -- ^ Input
-> Either (Set c, [c]) ()
match dfa = \case
[] -- end of string should councide with automata bein in final state
| inEndState dfa -> return ()
| otherwise -> throwError (allCharsExpectedHere dfa, [])
(c : s) -> do
dfa' <- move dfa c s
match dfa' s
-- | Consume character, move to new state.
--
move
:: (Eq s, Ord c)
=> NFA c s -- ^ DFA
-> c -- ^ Next character
-> [c] -- ^ Rest of the string (for error reports)
-> Either (Set c, [c]) (NFA c s) -- ^ Error or new automata
move dfa c s = do
case
[ dfa {nfaStart = d} -- I reuse starting state as current
| Move s c' d <- Set.toList (nfaMoves dfa) -- all moves
, s == nfaStart dfa -- from current one
, c == c' -- on that char
]
of
[] -> throwError (allCharsExpectedHere dfa, c : s) -- no moves, die
nfa : _ -> return nfa -- no more than one move should exist
-- | Find all characters that are exprected in current state.
--
allCharsExpectedHere :: (Eq s, Ord c) => NFA c s -> Set c
allCharsExpectedHere dfa = Set.fromList
[ c
| Move s c _ <- Set.toList (nfaMoves dfa)
, s == nfaStart dfa
]
-- | Check if DFA is in its final state.
--
inEndState :: Ord s => NFA c s -> Bool
inEndState dfa = Set.member (nfaStart dfa) (nfaEnds dfa)
-- | Convert regiular expression to DFA.
--
dfa :: Reg Char -> NFA Char (Set Int)
dfa reg = filterNFA $ determine (build reg) (literals reg)
--------------------------------------------------------------------------------
auto :: NFA Char (Set Int)
auto = dfa $ Star (fromString "hello " `Or` fromString "hi ")
nfa :: NFA Char Int
nfa = build $ Star (fromString "hello " `Or` fromString "hi ")
testOk :: Either (Set Char, String) ()
testOk = match auto "hi hi hello hello hi "
testErr :: Either (Set Char, String) ()
testErr = match auto "hi hi hell hi hello hi"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment