Skip to content

Instantly share code, notes, and snippets.

@kseo
Created December 7, 2016 14:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kseo/ca4d5ec49bf40cedb60f4ed4e11acef2 to your computer and use it in GitHub Desktop.
Save kseo/ca4d5ec49bf40cedb60f4ed4e11acef2 to your computer and use it in GitHub Desktop.
DOM parser
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE TypeFamilies #-}
module Beard.DOM
( Element(..)
, Node(..)
, parseDOM
) where
import Control.Monad (when)
import Data.Foldable (fold)
import Data.List.NonEmpty (NonEmpty(..))
import qualified Data.List.NonEmpty as NE
import Data.Semigroup ((<>))
import Data.Text (Text)
import qualified Data.Text as T
import Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HMS
import qualified Data.Set as Set
import Text.HTML.TagSoup
import Text.Megaparsec
import Text.Megaparsec.Pos (SourcePos)
import Text.Megaparsec.Error (Dec, ShowToken)
type TextTag = Tag Text
type TextTagStream = [TextTag]
data Element = Element
{ elementName :: !Text
, elementAttrs :: !(HashMap Text Text)
, elementChildren :: [Node]
} deriving (Eq, Show)
data Node =
ElementNode Element
| TextNode Text
deriving (Eq, Show)
type DOMParser = Parsec Dec TextTagStream
instance Stream TextTagStream where
type Token TextTagStream = TextTag
uncons [] = Nothing
uncons (x:xs) = Just (x, xs)
updatePos = const updatePos'
updatePos'
:: Pos -- ^ Tab width
-> SourcePos -- ^ Current position
-> TextTag -- ^ Current token
-> (SourcePos, SourcePos) -- ^ Actual position and incremented position
updatePos' _ apos@(SourcePos n l c) token = (apos, npos)
where
u = unsafePos 1
npos = SourcePos n l (c <> u)
instance ShowToken TextTag where
showTokens = fold . NE.intersperse " " . fmap show
tagSatisfy :: (TextTag -> Bool) -> DOMParser TextTag
tagSatisfy f = token testTag Nothing
where testTag x = if f x
then Right x
else Left (Set.singleton (Tokens (x:|[])), Set.empty, Set.empty)
tagOpen = tagSatisfy isTagOpen <?> "tag open"
tagClose = tagSatisfy isTagClose <?> "tag close"
tagText = tagSatisfy isTagText <?> "tag text"
text :: DOMParser Text
text = fromTagText <$> tagText
element :: DOMParser Element
element = do
(TagOpen tagName attrs) <- tagOpen
children <- many node
closeTag@(TagClose tagName') <- tagClose
when (tagName /= tagName') $ fail $ "unexpected close tag: " ++ show closeTag
return $ Element tagName (HMS.fromList attrs) children
node :: DOMParser Node
node = ElementNode <$> element <|> TextNode <$> text
eraseComments :: TextTagStream -> TextTagStream
eraseComments = filter (not .isTagComment)
parseDOM :: Text -> Either (ParseError TextTag Dec) [Node]
parseDOM html = do
let tags = eraseComments (parseTags html)
parse (many node) "" tags
@kseo
Copy link
Author

kseo commented Jan 12, 2017

test

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment