Skip to content

Instantly share code, notes, and snippets.

@Lysxia
Last active September 5, 2022 14:07
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Lysxia/50141795d11e9d43f5094039a35412a3 to your computer and use it in GitHub Desktop.
Save Lysxia/50141795d11e9d43f5094039a35412a3 to your computer and use it in GitHub Desktop.
Applicative HTML Parser
{-# LANGUAGE GADTs, DeriveFunctor, LambdaCase #-}
import Data.Functor.Product
import Data.Void
import Control.Applicative
type Tag = String
type Attr = String
data HTML
= Tag Tag [Attr] [HTML]
| Txt String
-- Example
example :: HTML
example =
Tag "html" ["lang=en"]
[ Tag "head" []
[ Tag "title" [] [ Txt "Hello World!" ]
]
, Tag "body" []
[ Tag "h1" [] [ Txt "Hi" ]
, Tag "p" ["id=lorem"] [ Txt "lorem ipsum" ]
]
]
parser :: HTMLParser [String]
parser = sequenceA
[getTitle, getH1, getLorem]
getTitle :: HTMLParser String
getTitle = onElem "title" (onChild getTxt)
getH1 :: HTMLParser String
getH1 = onElem "body" (onElem "h1" (onChild getTxt))
getLorem :: HTMLParser String
getLorem = onElem "body" (onAttr "id=lorem" (onChild getTxt))
main :: IO ()
main = print (parseHTML parser example)
-- Implementation
onElem :: Tag -> HTMLParser a -> HTMLParser a
onElem t = Expect . Find (HasTag t)
onAttr :: String -> HTMLParser a -> HTMLParser a
onAttr a = Expect . Find (HasAttr a)
onChild :: HTMLParser a -> HTMLParser a
onChild = OnChild
getTxt :: HTMLParser String
getTxt = Expect (OnElt $ \case Txt t -> Just t ; Tag _ _ _ -> Nothing)
data HTMLParser a where
And :: Day HTMLParser HTMLParser a -> HTMLParser a
-- ^ Conjunction of parsers (all must succeed and their results are combined)
Find :: Selector -> HTMLParser a -> HTMLParser (Maybe a)
-- ^ Find matching element anywhere and apply subparser to it.
-- Return Nothing if no match found.
Expect :: HTMLParser (Maybe a) -> HTMLParser a
-- ^ Swallow the Maybe error in Find
OnChild :: HTMLParser a -> HTMLParser a
-- ^ Try applying subparser to children
Check :: Selector -> HTMLParser a -> HTMLParser a
-- ^ Match current element and apply subparser to it
Fail :: HTMLParser a
OnElt :: (HTML -> a) -> HTMLParser a
Done :: a -> HTMLParser a
data Selector
= HasTag Tag
| HasAttr Attr
| (:&&:) Selector Selector
| Not Selector
evalSel :: Selector -> HTML -> Bool
evalSel (HasTag t) (Tag t' _ _) = t == t'
evalSel (HasTag _) (Txt _) = False
evalSel (HasAttr x) (Tag _ xs _) = x `elem` xs
evalSel (HasAttr x) (Txt _) = False
evalSel (s :&&: s') h = evalSel s h && evalSel s' h
evalSel (Not s) h = not (evalSel s h)
data Day f g a where
LiftA2 :: (b -> c -> a) -> f b -> g c -> Day f g a
data Bind f g a where
(:>>=:) :: f a -> (a -> g b) -> Bind f g b
instance Functor HTMLParser where
fmap = liftA
instance Applicative HTMLParser where
pure = Done
liftA2 f (Done x) (Done y) = Done (f x y)
liftA2 f Fail _ = Fail
liftA2 _ _ Fail = Fail
liftA2 f x y = And (LiftA2 f x y)
-- Split a parser into a subparser to parse the children of an element and a continuation to parse its siblings.
split :: HTML -> HTMLParser a -> Bind HTMLParser HTMLParser a
split h Fail = Fail :>>=: absurd
split h (OnElt f) = Done () :>>=: \() -> Done (f h)
split h (Done x) = Done () :>>=: \() -> Done x
split h p0@(Find f p)
| evalSel f h = case split h p of
p :>>=: k -> p :>>=: (fmap Just . k)
| otherwise = p0 :>>=: \case Nothing -> p0 ; Just x -> Done (Just x)
split h p0@(Check f p)
| evalSel f h = split h p
| otherwise = Fail :>>=: absurd
split h (OnChild p) = p :>>=: Done
split h (Expect p) = case split h p of
p :>>=: k -> p :>>=: (Expect . k)
split h (And (LiftA2 f l r)) = case (split h l, split h r) of
(l :>>=: kl, r :>>=: kr) -> liftA2 (,) l r :>>=: \(x, y) -> liftA2 f (kl x) (kr y)
endElement :: HTMLParser a -> Maybe a
endElement (Done x) = Just x
endElement Fail = Nothing
endElement (Find _ _) = Just Nothing
endElement (Check _ _) = Nothing
endElement (Expect p) = case endElement p of
Just (Just x) -> Just x
_ -> Nothing
endElement (OnChild p) = Nothing
endElement (And (LiftA2 f p q)) = liftA2 f (endElement p) (endElement q)
endElement (OnElt _) = error "should not happen"
parseHTML' :: HTMLParser a -> HTML -> HTMLParser a
parseHTML' p h = case split h p of
Done x :>>=: k -> k x
p :>>=: k -> case h of
Tag _ _ hs -> parseHTMLs p hs k
Txt _ -> parseHTMLs p [] k
parseHTMLs :: HTMLParser a -> [HTML] -> (a -> HTMLParser b) -> HTMLParser b
parseHTMLs p [] k = case endElement p of
Nothing -> Fail
Just x -> k x
parseHTMLs p (h : hs) k = case parseHTML' p h of
Fail -> Fail
p -> parseHTMLs p hs k
parseHTML :: HTMLParser a -> HTML -> Maybe a
parseHTML p h = endElement (parseHTML' p h)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment