Last active
May 11, 2017 17:11
-
-
Save tfausak/f0581d40cc7d308aa93940cd531ccc94 to your computer and use it in GitHub Desktop.
Use Pandoc to check for broken links in any document.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env stack | |
{- | |
stack | |
--resolver lts-8.13 | |
--install-ghc | |
script | |
--package bytestring | |
--package containers | |
--package data-default | |
--package Glob | |
--package http-client | |
--package http-client-tls | |
--package http-types | |
--package pandoc | |
-} | |
-- | This executable finds broken links in any document. Use it like this: | |
-- stack link-rot.hs PATTERN TYPE | |
-- For example: | |
-- stack link-rot.hs '**/*.md' 'markdown_github' | |
module Main | |
( main | |
) where | |
import qualified Control.Exception as Exception | |
import qualified Control.Monad as Monad | |
import qualified Data.ByteString.Lazy as ByteString | |
import qualified Data.Default as Default | |
import qualified Data.List as List | |
import qualified Data.Set as Set | |
import qualified Network.HTTP.Client as Client | |
import qualified Network.HTTP.Client.TLS as TLS | |
import qualified Network.HTTP.Types as HTTP | |
import qualified System.Environment as Environment | |
import qualified System.FilePath.Glob as Glob | |
import qualified Text.Pandoc as Pandoc | |
main :: IO () | |
main = do | |
[pattern, format] <- Environment.getArgs | |
files <- Glob.glob pattern | |
let Just reader = lookup format Pandoc.readers | |
putStrLn "Looking for links..." | |
allLinks <- Monad.foldM (getLinks reader) [] files | |
let uniqueLinks = Set.fromList allLinks | |
let httpLinks = Set.filter isHttp uniqueLinks | |
let links = Set.toAscList httpLinks | |
putStrLn ("Found " ++ pluralize "link" (length links) ++ ".") | |
putStrLn "Checking links..." | |
manager <- Client.newManager TLS.tlsManagerSettings | |
Monad.forM_ links (checkLink manager) | |
getLinks :: Pandoc.Reader -> [String] -> FilePath -> IO [String] | |
getLinks reader links file = do | |
Right document <- runReader reader file | |
let newLinks = getDocumentLinks document | |
pure (newLinks ++ links) | |
isHttp :: String -> Bool | |
isHttp = List.isPrefixOf "http" | |
checkLink :: Client.Manager -> String -> IO () | |
checkLink manager link = do | |
request <- Client.parseRequest link | |
Exception.catch | |
(do response <- Client.httpNoBody request manager | |
let status = HTTP.statusCode (Client.responseStatus response) | |
putStrLn (show status ++ " " ++ link) | |
Client.responseClose response) | |
(\e -> putStrLn (concat | |
[ "ERROR: " | |
, show (e :: Client.HttpException) | |
, " (" | |
, show link | |
, ")" | |
])) | |
runReader :: Pandoc.Reader | |
-> FilePath | |
-> IO (Either Pandoc.PandocError Pandoc.Pandoc) | |
runReader reader file = | |
case reader of | |
Pandoc.ByteStringReader f -> do | |
contents <- ByteString.readFile file | |
let ignoreMedia = fmap (fmap fst) | |
ignoreMedia (f pandocOptions contents) | |
Pandoc.StringReader f -> do | |
contents <- readFile file | |
f pandocOptions contents | |
pandocOptions :: Pandoc.ReaderOptions | |
pandocOptions = Default.def | |
getDocumentLinks :: Pandoc.Pandoc -> [String] | |
getDocumentLinks document = | |
case document of | |
Pandoc.Pandoc _ blocks -> concatMap getBlockLinks blocks | |
getBlockLinks :: Pandoc.Block -> [String] | |
getBlockLinks block = | |
case block of | |
Pandoc.BlockQuote blocks -> concatMap getBlockLinks blocks | |
Pandoc.BulletList blocks -> concatMap getBlockLinks (concat blocks) | |
Pandoc.CodeBlock _ _ -> [] | |
Pandoc.DefinitionList definitions -> | |
let (rawInlines, rawBlocks) = unzip definitions | |
inlines = concat rawInlines | |
blocks = concat (concat rawBlocks) | |
inlineLinks = concatMap getInlineLinks inlines | |
blockLinks = concatMap getBlockLinks blocks | |
in inlineLinks ++ blockLinks | |
Pandoc.Div _ blocks -> concatMap getBlockLinks blocks | |
Pandoc.Header _ _ inlines -> concatMap getInlineLinks inlines | |
Pandoc.HorizontalRule -> [] | |
Pandoc.LineBlock inlines -> concatMap (concatMap getInlineLinks) inlines | |
Pandoc.Null -> [] | |
Pandoc.OrderedList _ blocks -> concatMap getBlockLinks (concat blocks) | |
Pandoc.Para inlines -> concatMap getInlineLinks inlines | |
Pandoc.Plain inlines -> concatMap getInlineLinks inlines | |
Pandoc.RawBlock _ _ -> [] | |
Pandoc.Table inlines _ _ rawHeaders rawRows -> | |
let inlineLinks = concatMap getInlineLinks inlines | |
headers = concat rawHeaders | |
headerLinks = concatMap getBlockLinks headers | |
rows = concat (concat rawRows) | |
rowLinks = concatMap getBlockLinks rows | |
in concat [inlineLinks, headerLinks, rowLinks] | |
getInlineLinks :: Pandoc.Inline -> [String] | |
getInlineLinks inline = | |
case inline of | |
Pandoc.Cite citations inlines -> | |
let prefixes = concatMap Pandoc.citationPrefix citations | |
suffixes = concatMap Pandoc.citationSuffix citations | |
prefixLinks = concatMap getInlineLinks prefixes | |
suffixLinks = concatMap getInlineLinks suffixes | |
inlineLinks = concatMap getInlineLinks inlines | |
in concat [prefixLinks, suffixLinks, inlineLinks] | |
Pandoc.Code _ _ -> [] | |
Pandoc.Emph inlines -> concatMap getInlineLinks inlines | |
Pandoc.Image _ inlines target -> | |
fst target : concatMap getInlineLinks inlines | |
Pandoc.LineBreak -> [] | |
Pandoc.Link _ inlines target -> | |
fst target : concatMap getInlineLinks inlines | |
Pandoc.Math _ _ -> [] | |
Pandoc.Note blocks -> concatMap getBlockLinks blocks | |
Pandoc.Quoted _ inlines -> concatMap getInlineLinks inlines | |
Pandoc.RawInline _ _ -> [] | |
Pandoc.SmallCaps inlines -> concatMap getInlineLinks inlines | |
Pandoc.SoftBreak -> [] | |
Pandoc.Space -> [] | |
Pandoc.Span _ inlines -> concatMap getInlineLinks inlines | |
Pandoc.Str _ -> [] | |
Pandoc.Strikeout inlines -> concatMap getInlineLinks inlines | |
Pandoc.Strong inlines -> concatMap getInlineLinks inlines | |
Pandoc.Subscript inlines -> concatMap getInlineLinks inlines | |
Pandoc.Superscript inlines -> concatMap getInlineLinks inlines | |
pluralize | |
:: (Integral a, Show a) | |
=> String -> a -> String | |
pluralize word amount = | |
case amount of | |
1 -> "1 " ++ word | |
_ -> show amount ++ " " ++ word ++ "s" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment