Skip to content

Instantly share code, notes, and snippets.

@jgm
Created November 13, 2012 16:17
Show Gist options
  • Save jgm/4066710 to your computer and use it in GitHub Desktop.
Save jgm/4066710 to your computer and use it in GitHub Desktop.
{-# LANGUAGE OverloadedStrings #-}
{-
PERFORMANCE
Faster than pandoc and with better memory usage.
Both compiled with -O2. Latest dev pandoc 1.10.
Benchmarked on 178K markdown file.
Markdown pandoc -f markdown_strict
max bytes residency 2045484 5752636
total cpu time 0.293s 0.542s
Still not up with the C implementations.
TODO
_ explore a line-based approach?
maybe don't need parser combinators, except for the scanners?
QUESTIONS
* nested quotes in link title? seems silly, but some impls do?
* limit html blocks to list of html block tags?
* markdown=1 attribute?
* allow blank lines not to separate two code blocks?
Markdown.pl fuses them. is this desirable?
CURRENTLY THEY ARE TREATED AS SEPARATE.
* do we want the linebreak at end of code block?
YES. TENTATIVELY. FOR CONFORMITY.
* in numbered lists, do we store list style?
YES.
* in bullet lists, does new style start new list?
YES. TENTATIVELY.
* is a 1-item list tight or loose?
TIGHT.
* does a tight list end once we start getting loose items?
YES - NOT YET VICE VERSA
* two blank lines end a list?
YES
* two blockquotes w blank line between
YES
* store entities as chars or entities?
CURRENTLY AS ENTITIES
* how exactly do html blocks work?
* char encoding?
* tab handling?
* should we retain user line breaks?
-}
module Main where
import qualified Data.Map as M
import Data.List (partition)
import Data.Char (isAscii, isSpace, isPunctuation, isSymbol)
import Network.URI (parseURI, URI(..), isAllowedInURI, escapeURIString)
import Data.Monoid ((<>))
import Data.Foldable (foldMap, toList)
import Control.Monad
import Control.Applicative hiding ((<|>),many,optional,empty)
import Text.Parsec hiding (sepBy, sepBy1)
import Text.Parsec.Text
import Data.Sequence (Seq, singleton, empty, (<|))
import qualified Data.Sequence as Seq
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Data.Text ( Text )
-- for HTML rendering
import qualified Text.Blaze.XHtml5 as H
import qualified Text.Blaze.Html5.Attributes as A
import Text.Blaze.Html.Renderer.Utf8 (renderHtml)
import qualified Text.Blaze.Html.Renderer.Text as BT
import Text.Blaze.Html hiding(contents)
import qualified Data.ByteString.Lazy as BL
-- for main loop
import System.Environment (getArgs)
-- for debugging
import Debug.Trace
-- Replacement for Parsec's 'sepBy1', which does not have the @try@
-- and so does not behave as I want it to.
sepBy1 :: P a -> P b -> P [a]
sepBy1 p sep = do
first <- p
rest <- many $ try $ sep >> p
return (first:rest)
sepBy :: P a -> P b -> P [a]
sepBy p sep = sepBy1 p sep <|> return []
-- Structured representation of a document.
data CodeAttr = CodeAttr { codeLang :: Maybe Text }
deriving Show
data Block = Para Inlines
| Header Int Inlines
| HRule
| Blockquote Blocks
| List Bool ListType [Blocks]
| CodeBlock CodeAttr Text
| HtmlBlock Text
deriving Show
-- TODO raw html
type Blocks = Seq Block
data Inline = Str Text
| Space
| SoftBreak
| LineBreak
| Emph Inlines
| Strong Inlines
| Code Text
| Link Inlines Text Text
| Image Inlines Text Text
| Entity Text
| RawHtml Text
| Markdown Text
| Err Text Text
deriving Show
type Inlines = Seq Inline
data NumWrapper = PeriodFollowing | ParenFollowing | ParensAround
deriving (Eq,Show)
data ListType = Bullet Char | Numbered NumWrapper Int deriving Show
-- Defining the parser.
type ReferenceMap = M.Map Text (Text, Text)
addLinkReference :: Text -> (Text, Text) -> P ()
addLinkReference key (url,tit) = updateState $ \st ->
st{ linkReferences = M.insert (T.toUpper key) (url,tit) (linkReferences st) }
lookupLinkReference :: Text -> P (Maybe (Text, Text))
lookupLinkReference key = do
refmap <- linkReferences <$> getState
return $ M.lookup (T.toUpper key) refmap
data ParserState = ParserState{
beginLineScanners :: [P ()]
, beginBlockScanners :: [P ()]
, linkReferences :: ReferenceMap
}
startingState :: ParserState
startingState = ParserState{
beginLineScanners = []
, beginBlockScanners = []
, linkReferences = M.empty
}
type P = GenParser ParserState
parseBlocks :: Text -> Either ParseError (Blocks, ReferenceMap)
parseBlocks = runP withRefs startingState "input"
where withRefs = do x <- pDoc
y <- linkReferences <$> getState
return (x,y)
-- Functions to maintain two stacks of scanners:
--
-- * The "begin line scanners" are run at the beginning
-- of a line that continues an existing block.
-- * The "begin block scanners" are run at the beginning
-- of a new block.
withBeginLineScanner :: P () -> P a -> P a
withBeginLineScanner scanner p = try $ do
updateState $ \st -> st{ beginLineScanners =
beginLineScanners st ++ [scanner] }
result <- p
updateState $ \st -> st{ beginLineScanners =
case reverse (beginLineScanners st) of
(_:xs) -> reverse xs
[] -> [] }
return result
withBeginBlockScanner :: P () -> P a -> P a
withBeginBlockScanner scanner p = try $ do
updateState $ \st -> st{ beginBlockScanners =
beginBlockScanners st ++ [scanner] }
result <- p
updateState $ \st -> st{ beginBlockScanners =
case reverse (beginBlockScanners st) of
(_:xs) -> reverse xs
[] -> [] }
return result
-- When we begin parsing a block container, such as a blockquote
-- or list item, we push new scanners onto the begin line and
-- begin block stacks. We then parse blocks until we can't
-- parse any more. At that point we pop the scanners we added
-- and return the result.
--
-- Note: the scanners are applied in the order they were added
-- to the stack. We apply the first one that was added, then
-- the second, etc.
--
-- After a newline, we try the begin line parsers. If they succeed,
-- the next line is considered part of the current block.
-- Otherwise, we consider the block closed and try to parse
-- another block.
-- | Parse one or more blocks with new begin line and begin block
-- scanners.
pBlocks :: P () -> P () -> P Blocks
pBlocks beginLineScanner beginBlockScanner =
withBeginLineScanner beginLineScanner $
withBeginBlockScanner beginBlockScanner $
msum <$> (pBlock `sepBy1` pBlockSep)
-- | Parse optional blanklines separating blocks, plus whatever
-- the begin block scanners require at the beginning of a new block.
pBlockSep :: P Int
pBlockSep = try $ do
num <- length <$> many (try $ pBeginBlock >> pBlankline)
pBeginBlock
return num
-- | Parses line beginning for new block.
pBeginBlock :: P ()
pBeginBlock = try $ getState >>= sequence_ . beginBlockScanners
pBeginLine :: P ()
pBeginLine = try $ getState >>= sequence_ . beginLineScanners
-- Utility parsers.
-- | Applies a parser and returns the raw text that was parsed,
-- along with the value produced by the parser.
withRaw :: P a -> P (a, Text)
withRaw parser = do
pos1 <- getPosition
inp <- getInput
result <- parser
pos2 <- getPosition
let (l1,c1) = (sourceLine pos1, sourceColumn pos1)
let (l2,c2) = (sourceLine pos2, sourceColumn pos2)
let inplines = take ((l2 - l1) + 1) $ T.lines inp
let raw = case inplines of
[] -> error "raw: inplines is null" -- shouldn't happen
[l] -> T.take (c2 - c1) l
ls -> T.unlines (init ls) <> T.take (c2 - 1) (last ls)
return (result, raw)
pEscapedChar :: P Char
pEscapedChar = try $ char '\\' *> satisfy isEscapable
isEscapable :: Char -> Bool
isEscapable c = isSymbol c || isPunctuation c
-- parses a character satisfying the predicate, but understands escaped
-- symbols
pSatisfy :: (Char -> Bool) -> P Char
pSatisfy p =
satisfy (\c -> c /= '\\' && p c)
<|> try (char '\\' *> satisfy (\c -> isEscapable c && p c))
pAnyChar :: P Char
pAnyChar = pSatisfy (const True)
pNonspaceChar :: P Char
pNonspaceChar = pSatisfy (`notElem` " \t\r\n")
pBlankline :: P ()
pBlankline = try $ skipMany pSpaceChar <* newline
pBlockquoteStart :: P ()
pBlockquoteStart = try $ pNonindentSpaces *> char '>' *> optional (char ' ')
pIndentSpace :: P ()
pIndentSpace = try (() <$ count 4 (char ' '))
<|> try (pNonindentSpaces >> char '\t' >> return ())
pNonindentSpaces :: P String
pNonindentSpaces = do
option "" $ do
char ' '
option " " $ do
char ' '
option " " $ do
char ' '
return " "
pSpaceChar :: P Char
pSpaceChar = oneOf " \t"
pLine :: P Text
pLine = T.pack <$> ( manyTill (satisfy (/='\n')) newline
<|> many1 anyChar -- for last line w/o newline
)
pCodeFenceLine :: P (String, CodeAttr)
pCodeFenceLine = try $ do
c <- oneOf "`~"
count 2 (char c)
extra <- many (char '`')
attr <- parseCodeAttributes <$> pLine
return (c:c:c:extra, attr)
parseCodeAttributes :: Text -> CodeAttr
parseCodeAttributes t = CodeAttr { codeLang = lang }
where lang = case T.words (T.strip t) of
[] -> Nothing
(l:_) -> Just l
-- | Parses one of a list of strings (tried in order).
oneOfStrings :: [String] -> P String
oneOfStrings [] = fail "no strings"
oneOfStrings strs = do
c <- anyChar
let strs' = [xs | (x:xs) <- strs, x == c]
case strs' of
[] -> fail "not found"
z | "" `elem` z -> return [c]
| otherwise -> (c:) `fmap` oneOfStrings strs'
-- | Parses a URI. Returns pair of original and URI-escaped version.
uri :: P (Text, Text)
uri = try $ do
let protocols = [ "http:", "https:", "ftp:", "file:", "mailto:",
"news:", "telnet:" ]
lookAhead $ oneOfStrings protocols
-- Scan non-ascii characters and ascii characters allowed in a URI.
-- We allow punctuation except when followed by a space, since
-- we don't want the trailing '.' in 'http://google.com.'
let innerPunct = try $ pSatisfy isPunctuation <*
notFollowedBy (newline <|> pSpaceChar)
let uriChar = innerPunct <|>
pSatisfy (\c -> not (isPunctuation c) &&
(not (isAscii c) || isAllowedInURI c))
-- We want to allow
-- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation)
-- as a URL, while NOT picking up the closing paren in
-- (http://wikipedia.org)
-- So we include balanced parens in the URL.
let inParens = try $ do char '('
res <- many uriChar
char ')'
return $ '(' : res ++ ")"
str <- concat <$> many1 (inParens <|> count 1 (innerPunct <|> uriChar))
str' <- option str $ char '/' >> return (str ++ "/")
-- now see if they amount to an absolute URI
let escapeURI = escapeURIString (not . isSpace)
case parseURI (escapeURI str') of
Just uri' -> if uriScheme uri' `elem` protocols
then return (T.pack str', T.pack $ show uri')
else fail "not a URI"
Nothing -> fail "not a URI"
data HtmlTagType = Opening Text | Closing Text | SelfClosing Text
-- returns name of tag needed to close, and whole tag
pHtmlTag :: P (HtmlTagType, Text)
pHtmlTag = try $ do
char '<'
-- do not end the tag with a > character in a quoted attribute.
closing <- (char '/' >> return True) <|> return False
tagname <- T.toLower . T.pack <$> many1 alphaNum
lookAhead $ oneOf " \t\r\n/>"
chunks <- manyTill (pQuoted '\'' <|> pQuoted '"'
<|> many1 (noneOf "\"'<>")) (char '>')
let body = concat chunks
let tagtype = if closing
then Closing tagname
else case reverse body of
('/':_) -> SelfClosing tagname
_ -> Opening tagname
return (tagtype, T.pack ('<' : ['/' | closing]) <> tagname <> T.pack body
<> ">")
pQuoted :: Char -> P String
pQuoted c = try $ do
char c
contents <- manyTill (satisfy (/= c)) (char c)
return (c : contents ++ [c])
-- Block-level parsers.
pDoc :: P Blocks
pDoc = pBlocks (return ()) (return ()) <* skipMany pBlankline <* eof
pBlock :: P Blocks
pBlock = pBlockquote
<|> pAtxHeader
<|> pHrule
<|> pList
<|> pCodeFence
<|> pCodeBlock
<|> pReference
<|> pHtmlBlock
<|> pPara
pInBalancedTags :: P Text
pInBalancedTags = try $ do
(tagtype, opener) <- pHtmlTag
case tagtype of
SelfClosing _ -> return opener
Closing _ -> mzero
Opening name -> (opener <>) <$> getRest name
where getRest name = try $ do
nontag <- T.pack <$> many (satisfy (/='<'))
(tagtype', x') <- lookAhead pHtmlTag
case tagtype' of
Closing n | n == name -> do
_ <- pHtmlTag
return $ nontag <> x'
Opening n | n == name -> do
chunk <- pInBalancedTags
rest <- getRest name
return $ nontag <> chunk <> rest
_ -> do
_ <- pHtmlTag
rest <- getRest name
return $ (nontag <> x') <> rest
pHtmlBlock :: P Blocks
pHtmlBlock = singleton . HtmlBlock <$>
((pInBalancedTags <|> pHtmlComment) <* skipMany pBlankline)
pHtmlComment :: P Text
pHtmlComment = try $ do
string "<!--"
rest <- manyTill anyChar (try $ string "-->")
return $ "<!--" <> T.pack rest <> "-->"
pReference :: P Blocks
pReference = try $ do
pNonindentSpaces
lab <- pLinkLabel
char ':'
pSpnl
url <- pLinkUrl
tit <- option T.empty $ try $ pSpnl >> pLinkTitle
pBlankline
addLinkReference lab (url,tit)
return empty
pLinkLabel :: P Text
pLinkLabel = try $ char '[' *>
(T.concat <$> (manyTill (regChunk <|> bracketed <|> codeChunk) (char ']')))
where regChunk = T.pack <$> many1 (pSatisfy (`notElem` "`[]"))
codeChunk = snd <$> withRaw pCode
bracketed = inBrackets <$> pLinkLabel
inBrackets t = "[" <> t <> "]"
pLinkUrl :: P Text
pLinkUrl = try $ do
inPointy <- (char '<' >> return True) <|> return False
T.pack <$> if inPointy
then manyTill (pSatisfy (`notElem` "\r\n>")) (char '>')
else concat <$> many (regChunk <|> parenChunk)
where regChunk = many1 (pSatisfy (`notElem` " \t\r\n()"))
parenChunk = inParens . concat <$>
(char '(' *>
manyTill (regChunk <|> parenChunk) (char ')'))
inParens x = '(' : x ++ ")"
pLinkTitle :: P Text
pLinkTitle = T.pack <$> (pLinkTitleDQ <|> pLinkTitleSQ <|> pLinkTitleP)
where pLinkTitleDQ = try $ char '"' *> manyTill pAnyChar (char '"')
pLinkTitleSQ = try $ char '\'' *> manyTill pAnyChar (char '\'')
pLinkTitleP = try $ char '(' *> manyTill pAnyChar (char ')')
pHruleLine :: P ()
pHruleLine = do
pNonindentSpaces
c <- oneOf "*-_"
count 2 (try $ pSp >> char c)
skipMany $ try $ pSp >> char c
pSp
lookAhead newline
return ()
pHrule :: P Blocks
pHrule = singleton HRule <$ try pHruleLine
pSp :: P ()
pSp = skipMany pSpaceChar
pSpnl :: P ()
pSpnl = try $ pSp *> optional (newline *> pSp)
joinLines :: [Text] -> Text
joinLines = T.intercalate (T.pack "\n")
-- handles paragraphs and setext headers and hrules
pPara :: P Blocks
pPara = processLines . filter (not . T.null)
<$> withBeginLineScanner paraLine (pLine `sepBy1` pBeginLine)
where paraLine = notFollowedBy pBlankline
>> notFollowedBy pIndentSpace
>> notFollowedBy pBlockquoteStart
>> notFollowedBy pAtxHeaderStart
>> notFollowedBy (try $ pSp >> pListMarker)
>> notFollowedBy pCodeFenceLine
markdown = singleton . Markdown . T.strip
processLines [] = empty
processLines ws =
case break isSpecialLine ws of
(xs, []) -> singleton $ Para $ markdown $ joinLines xs
(xs,(y:ys))
| isSetextLine y ->
case reverse xs of
[] -> Header (setextLevel y) empty
<| processLines ys
[z] -> Header (setextLevel y) (markdown z)
<| processLines ys
(z:zs) -> Para (markdown $ joinLines $ reverse zs)
<| Header (setextLevel y) (markdown z)
<| processLines ys
| isHruleLine y ->
case xs of
[] -> HRule
<| processLines ys
_ -> Para (markdown $ joinLines xs)
<| HRule
<| processLines ys
| otherwise -> error "Should not happen"
isSetext1Line x = not (T.null x) && T.all (=='=') x
isSetext2Line x = not (T.null x) && T.all (=='-') x
isSetextLine x = isSetext1Line x || isSetext2Line x
setextLevel x = if isSetext1Line x then 1 else 2
isHruleLine x = case runP pHruleLine startingState "" x of
Right _ -> True
Left _ -> False
isSpecialLine x = isSetextLine x || isHruleLine x
pBlockquote :: P Blocks
pBlockquote = singleton . Blockquote <$>
try (pBlockquoteStart >> pBlocks (optional pBlockquoteStart) pBlockquoteStart)
pCodeFence :: P Blocks
pCodeFence = try $ do
(fence, attr) <- pCodeFenceLine
lns <- manyTill pLine (try $ string fence >> pLine)
return $ singleton $ CodeBlock attr $ T.unlines lns
pCodeBlock :: P Blocks
pCodeBlock = try $ do
pIndentSpace
singleton . CodeBlock CodeAttr{ codeLang = Nothing } . T.unlines
<$> withBeginLineScanner pIndentSpace (pLine `sepBy1` pBeginLine)
pList :: P Blocks
pList = try $ do
sps <- pNonindentSpaces
col <- sourceColumn <$> getPosition
listType <- pListMarker
col' <- sourceColumn <$> getPosition
let sublistIndent = () <$ count (col' - col - 1) (char ' ')
let starter = try $ string sps *> pListStart listType
let listItemBlocks =
pBlocks (notFollowedBy
(try $ string sps >> sublistIndent >> pSp >> pListMarker))
(try $ (string sps >> sublistIndent)
<|> lookAhead (try $ pBlankline *> notFollowedBy pBlankline))
first <- listItemBlocks
isTight <- (== 0) <$> (lookAhead pBlockSep <|> return 0)
let listItem = try $ do
num <- pBlockSep
when (isTight && num > 0) $
fail "Change in tightness ends list"
starter
blocks <- listItemBlocks
return blocks
rest <- many listItem
let isTight' = if null rest then True else isTight
return $ singleton $ List isTight' listType (first:rest)
pListMarker :: P ListType
pListMarker = pBullet <|> pListNumber
pBullet :: P ListType
pBullet = Bullet <$> oneOf "*+-" <* (pSpaceChar <|> lookAhead newline)
pListNumber :: P ListType
pListNumber =
(pListNumberDig <|> pListNumberPar) <* (pSpaceChar <|> lookAhead newline)
where pListNumberDig = try $ do
num <- read <$> many1 digit
wrap <- PeriodFollowing <$ char '.'
<|> ParenFollowing <$ char ')'
return $ Numbered wrap num
pListNumberPar = try $ do
char '('
num <- read <$> many1 digit
char ')'
return $ Numbered ParensAround num
pListStart :: ListType -> P ()
pListStart (Bullet c) = () <$ notFollowedBy pHruleLine <* char c <* pSpaceChar
pListStart (Numbered w _) = try $ do
marker <- pListNumber
case marker of
Numbered w' _ | w == w' -> return ()
_ -> fail "Change in list style"
pSetextHeaderLine :: P Int
pSetextHeaderLine = try $ do
n <- (1 <$ skipMany1 (char '=')) <|> (2 <$ skipMany1 (char '-'))
pBlankline
return n
pAtxHeaderStart :: P Int
pAtxHeaderStart = length <$> try (many1 (char '#') <* pSpaceChar)
pAtxHeader :: P Blocks
pAtxHeader = do
lev <- pAtxHeaderStart
singleton . Header lev . singleton . Markdown . stripClosingHashes <$> pLine
where stripClosingHashes = T.strip . T.dropAround (=='#') . T.strip
parseInlines :: ReferenceMap -> Text -> Inlines
parseInlines refmap t =
case runP (msum <$> many pInline <* eof) st "" t of
Left e -> singleton $ Err t' (T.pack $ show e)
Right r -> r
where st = startingState{ linkReferences = refmap }
t' = T.strip t
pInline :: P Inlines
pInline = pSpace
<|> pUri
<|> pStr
<|> pStrong '*'
<|> pStrong '_'
<|> pEmph '*'
<|> pEmph '_'
<|> pLink
<|> pImage
<|> pCode
<|> pEntity
<|> pAutolink
<|> pRawHtml
<|> pSym
pSpace :: P Inlines
pSpace = singleton <$> (pSpaceSpace <|> pSpaceNewline)
where pSpaceSpace = try $ pSpaceChar >>
(pSpaceNewline <|> pSpaceLB <|> return Space)
pSpaceLB = try $ pSpaceChar >> pSp >>
((pSpaceNewline >> return LineBreak) <|> return Space)
pSpaceNewline = newline >> pSp >> return SoftBreak
pStr :: P Inlines
pStr = do
first <- alphaNum
rest <- many $ alphaNum <|> (try $ char '_' <* lookAhead alphaNum)
return $ singleton . Str . T.pack $ first:rest
pSym :: P Inlines
pSym = singleton . Str . T.singleton <$> pNonspaceChar
pUri :: P Inlines
pUri = do
(orig,escaped) <- uri
return $ singleton $ Link (singleton $ Str orig) escaped (T.empty)
pEmph :: Char -> P Inlines
pEmph c = try $ do
char c
notFollowedBy pSpaceChar
contents <- msum <$>
many1 ( (try $ notFollowedBy (char c) >> pInline) <|> pStrong c )
(char c >> return (singleton (Emph contents)))
<|> return (Str (T.pack "*") <| contents)
pStrong :: Char -> P Inlines
pStrong c = try $ do
let marker = try $ char c >> char c
marker
notFollowedBy pSpaceChar
contents <- msum <$> many1 (try $ notFollowedBy marker >> pInline)
marker
return (singleton (Strong contents))
pCode :: P Inlines
pCode = try $ do
numticks <- length <$> many1 (char '`')
pSp
let end = try $ count numticks (char '`') *> notFollowedBy (char '`')
let nonBacktickSpan = many1 (noneOf "`")
let backtickSpan = many1 (char '`')
singleton . Code . T.strip . T.pack . concat
<$> manyTill (nonBacktickSpan <|> backtickSpan) end
pLink :: P Inlines
pLink = try $ do
lab <- pLinkLabel
refmap <- linkReferences <$> getState
let lab' = parseInlines refmap lab
pInlineLink lab' <|> pReferenceLink lab lab'
<|> return (singleton (Str "[") <> lab' <> singleton (Str "]"))
pInlineLink :: Inlines -> P Inlines
pInlineLink lab = try $ do
char '('
pSp
url <- pLinkUrl
tit <- option "" $ try $ pSpnl *> pLinkTitle <* pSp
char ')'
return $ singleton $ Link lab url tit
pReferenceLink :: Text -> Inlines -> P Inlines
pReferenceLink rawlab lab = try $ do
ref <- option rawlab $ try $ pSpnl >> pLinkLabel
let ref' = if T.null ref then rawlab else ref
lookupResult <- lookupLinkReference ref'
case lookupResult of
Just (url,tit) -> return $ singleton $ Link lab url tit
Nothing -> fail "Reference not found"
pImage :: P Inlines
pImage = try $ do
char '!'
let linkToImage (Link lab url tit) = Image lab url tit
linkToImage x = x
fmap linkToImage <$> pLink
pRawHtml :: P Inlines
pRawHtml = singleton . RawHtml . snd <$> pHtmlTag
pEntity :: P Inlines
pEntity = try $ do
char '&'
res <- pCharEntity <|> pDecEntity <|> pHexEntity
char ';'
return $ singleton $ Entity $ T.pack $ '&':res ++ ";"
pCharEntity :: P [Char]
pCharEntity = many1 letter
pDecEntity :: P [Char]
pDecEntity = try $ do
char '#'
res <- many1 digit
return $ '#':res
pHexEntity :: P [Char]
pHexEntity = try $ do
char '#'
x <- oneOf "Xx"
res <- many1 hexDigit
return $ '#':x:res
pAutolink :: P Inlines
pAutolink = try $ char '<' *> pUri <* char '>'
-- blocks
parseMarkdown :: Text -> Either ParseError Blocks
parseMarkdown t =
case parseBlocks t of
Left err -> Left err
Right (bls, refmap) -> Right $ processBlocks refmap bls
processBlocks :: ReferenceMap -> Blocks -> Blocks
processBlocks refmap = fmap processBl
where processInlines = foldMap processInline
processInline (Markdown t) = parseInlines refmap t
processInline x = singleton x
processBl :: Block -> Block
processBl bl =
case bl of
Para ils -> Para $ processInlines ils
Header n ils -> Header n $ processInlines ils
Blockquote bls -> Blockquote $ processBlocks refmap bls
List tight listType items ->
List tight listType $ map (processBlocks refmap) items
x -> x
renderBlocks :: Blocks -> Html
renderBlocks = foldMap renderBlock
where renderBlock :: Block -> Html
renderBlock (Header n ils)
| n >= 1 && n <= 5 = withNl $ ([H.h1,H.h2,H.h3,H.h4,H.h5] !! (n - 1))
$ renderInlines ils
| otherwise = withNl $ H.p (renderInlines ils)
renderBlock (Para ils) = withNl $ H.p (renderInlines ils)
renderBlock (HRule) = withNl $ H.hr
renderBlock (Blockquote bs) =
withNl $ H.blockquote $ "\n" <> renderBlocks bs
renderBlock (CodeBlock attr t) =
withNl $ case codeLang attr of
Nothing -> base
Just lang -> base ! A.class_ (toValue lang)
where base = H.pre $ H.code $ toHtml t
renderBlock (List tight (Bullet _) items) =
withNl $ H.ul $ "\n" <> mapM_ (li tight) items
renderBlock (List tight (Numbered _ n) items) =
if n == 1 then base else base ! A.start (toValue n)
where base = withNl $ H.ol $ "\n" <> mapM_ (li tight) items
renderBlock (HtmlBlock raw) = withNl $ H.preEscapedToMarkup raw
withNl x = x <> "\n"
li :: Bool -> Blocks -> Html
li tight bs =
if tight
then case toList bs of
[Para zs] -> H.li (renderInlines zs) <> "\n"
[Para zs, List{}] -> H.li (renderInlines zs <> "\n" <>
renderBlocks (Seq.drop 1 bs)) <> "\n"
_ -> toLi bs
else toLi bs
where toLi x = (H.li $ renderBlocks x) <> "\n"
renderInlines :: Inlines -> Html
renderInlines = foldMap renderInline
where renderInline :: Inline -> Html
renderInline (Str t) = toHtml t
renderInline Space = " "
renderInline SoftBreak = "\n"
renderInline LineBreak = H.br <> "\n"
renderInline (Emph ils) = H.em $ renderInlines ils
renderInline (Strong ils) = H.strong $ renderInlines ils
renderInline (Code t) = H.code $ toHtml t
renderInline (Link ils url tit) =
if T.null tit then base else base ! A.title (toValue tit)
where base = H.a ! A.href (toValue url) $ renderInlines ils
renderInline (Image ils url tit) =
if T.null tit then base else base ! A.title (toValue tit)
where base = H.img ! A.src (toValue url)
! A.alt (toValue $ BT.renderHtml -- TODO strip tags
$ renderInlines ils)
renderInline (Entity t) = H.preEscapedToMarkup t
renderInline (RawHtml t) = H.preEscapedToMarkup t
renderInline (Markdown t) = toHtml t -- shouldn't happen
renderInline (Err t e) = H.span ! A.class_ "error"
! A.title (toValue e)
$ toHtml t
convert :: [String] -> Text -> IO ()
convert opts t = case parseMarkdown (t <> "\n") of
Left e -> error (show e)
Right r -> render r
where render = if "-n" `elem` opts
then print
else BL.putStr . renderHtml . renderBlocks
-- main loop
main :: IO ()
main = do
args <- getArgs
let isOpt ('-':_) = True
isOpt _ = False
let (opts, files) = partition isOpt args
case files of
[] -> T.getContents >>= convert opts
_ -> mapM T.readFile files >>= convert opts . joinLines
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment