Created
November 14, 2015 10:09
-
-
Save deflexor/247f97f4a59de0de5109 to your computer and use it in GitHub Desktop.
stack runghc Main.hs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# LANGUAGE OverloadedStrings #-} | |
module Main where | |
import Debug.Trace (trace, traceM, traceStack, traceShowM) | |
import Control.Applicative | |
import Control.Monad (unless, forM_, forM, void, mzero) | |
import Data.Char (isAlpha, isAlphaNum) | |
import qualified Text.Parsec as P | |
import Text.Parsec.Text | |
import qualified Data.Text as T | |
import Data.Text.IO as TIO | |
data Elem = Tag { tagName :: T.Text | |
, tagAttrs :: [(T.Text, T.Text)] | |
, tagChildren :: [Elem] } | |
| Comment T.Text | |
| TagText T.Text | |
| TagHdr (Maybe T.Text) | |
deriving (Eq, Read, Show) | |
nonClosedTags = ["link", "meta", "input", "img", "br"] | |
nametoken :: Parser T.Text | |
nametoken = do | |
part1 <- P.letter | |
part2 <- P.many (P.alphaNum <|> P.char '-' <|> P.char '.') | |
return $ T.cons part1 (T.pack part2) | |
tagattr :: Parser (T.Text, T.Text) | |
tagattr = do | |
name <- nametoken | |
val <- tagattrval | |
return (name, val) | |
tagattrval :: Parser T.Text | |
tagattrval = do | |
P.spaces | |
P.char '=' | |
P.choice [quotedval, nametoken] | |
quotedval :: Parser T.Text | |
quotedval = do | |
q <- P.satisfy (\c -> c == '"' || c == '\'') | |
val <- P.many $ P.satisfy (/= q) | |
P.skipMany (P.char q) | |
return $ T.pack val | |
closetag :: T.Text -> Parser () | |
closetag name = do | |
P.char '<' | |
P.char '/' P.<?> "closing tag indicator" | |
P.string $ T.unpack name | |
void $ P.char '>' | |
textual :: Parser Elem | |
textual = do | |
text <- P.many $ P.satisfy (/= '<') | |
if null text then mzero else return $ TagText $ T.pack text | |
childtags :: T.Text -> Parser [Elem] | |
childtags name = do | |
ct <- P.sepBy (tagorcomment <|> textual) P.spaces | |
P.spaces | |
closetag name | |
return ct | |
comment :: Parser Elem | |
comment = do | |
P.spaces | |
P.string "<!--" | |
ctext <- T.pack <$> P.manyTill P.anyChar (P.try (P.string "-->")) | |
return $ Comment ctext | |
tagelement :: Parser Elem | |
tagelement = do | |
P.spaces | |
P.char '<' | |
name <- nametoken | |
traceShowM name | |
P.spaces | |
attrs <- P.sepBy tagattr P.spaces | |
P.spaces | |
selfClose <- P.option False ((const True) <$> P.char '/') | |
P.char '>' | |
childtags <- if selfClose || name `elem` nonClosedTags then | |
return [] | |
else childtags name | |
return $ Tag name attrs childtags | |
tagorcomment :: Parser Elem | |
tagorcomment = (P.try tagelement) <|> (P.try comment) | |
markupParser :: Parser [Elem] | |
markupParser = do | |
tags <- P.many1 tagorcomment | |
P.spaces | |
return tags | |
parseMarkup :: T.Text -> Either P.ParseError [Elem] | |
parseMarkup = P.parse markupParser "" | |
testMarkup = "<html><head> \n\ | |
\ <meta http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1251\"> \n\ | |
\ <link rel=\"shortcut icon\" href=\"/favicon.ico\"> \n\ | |
\ <!--[if lte IE 8]> \n\ | |
\ <link rel=\"stylesheet\" type=\"text/css\" href=\"/_all/top/ie8.css?nocache=20140901\"> \n\ | |
\ <![endif]--> \n\ | |
\ <script type=\"text/javascript\" src=\"/_/js/jquery.min.js\" ></script> \n\ | |
\ </head> \n\ | |
\ <body> \n\ | |
\ </body></html>" | |
testParser = parseMarkup testMarkup | |
main = print testParser |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment