public
Created

Scrapes most-recent versions of all Hackage packages for the status of their documentation. Note that if you want to use this, you'll have to change the file path to save to at the top of the program.

  • Download Gist
H.hs
Haskell
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
{-# LANGUAGE LambdaCase #-}
module Main where
 
import Control.Monad ((>=>))
import Data.Either (rights, lefts)
import Data.Tree.NTree.TypeDefs (NTree)
import Data.List (isPrefixOf, isInfixOf)
import Network.HTTP (simpleHTTP, getRequest, rspBody)
import Network.Stream (ConnError)
import Text.XML.HXT.Core hiding (mkName)
 
resultFile :: String
resultFile = "/home/shana/hackageresult"
 
parsePackages
:: (ArrowXml cat, ArrowChoice cat) => cat XmlTree [Char]
parsePackages = hasName "a" >>> getAttrValue "href"
>>> arr (\x -> if "/package/" `isPrefixOf` x
then Left $ prependHackage x
else Right ())
>>> returnA ||| zeroArrow
 
parseVersion :: ArrowXml a => a XmlTree String
parseVersion = hasName "tr" //> hasName "strong" /> getText
 
parseDate :: (ArrowXml cat, ArrowChoice cat) => cat XmlTree String
parseDate = hasName "tr" //> getText
>>> arr (\x -> if isDate x
then Left x
else Right ())
>>> returnA ||| zeroArrow
 
-- try to guess if something's a date
isDate :: String -> Bool
isDate s = length (words s) == 6 && ":" `isInfixOf` (last $ take 4 (words s))
 
parseModules :: ArrowXml a => a XmlTree (Either String String)
parseModules = hasAttrValue "class" (== "module")
/> ifA isText (getText >>^ Left)
(hasName "a" >>> getAttrValue "href" >>^ Right)
 
appendReport :: String -> String -> String
appendReport u v = (u ++ "-" ++ v ++ "/reports/")
 
prependHackage :: String -> String
prependHackage = ("http://hackage.haskell.org" ++)
 
packageUrl :: String
packageUrl = "http://hackage.haskell.org/packages/"
 
fetch :: String -> IO (Either ConnError String)
fetch = simpleHTTP . getRequest >=> return . fmap rspBody
 
parseHackage :: Maybe [String] -> IO [(String, PackageStatus)]
parseHackage urls = case urls of
Nothing -> fetch packageUrl >>= \case
Left _ -> return []
Right pp -> case runL parsePackages pp of
[] -> return []
xs -> onPackage xs
Just xs -> onPackage xs
 
where
onPackage xs = do
let n = length xs
flip mapM (zip xs [1 .. ]) $ \(p, cn) -> do
putStrLn $ "[" ++ show cn ++ "/" ++ show n ++ "] " ++ "Processing " ++ p
r <- processPackage p
appendFile resultFile $ show (p, r) ++ "\n"
return (p, r)
 
data PackageStatus = AllGood | NoDetectedModules
| MissingDocs (Maybe String) String String
| MixedDocs
| NetworkProblem String
| ParsingProblem String
deriving (Eq, Show)
 
runL :: LA (NTree XNode) b -> String -> [b]
runL p = runLA (xreadDoc //> p)
 
processPackage :: String -> IO PackageStatus
processPackage u = do
ps <- fetch u >>= \case
Left e -> return . Left . NetworkProblem $ show e
Right s -> return $ case runL parseVersion s of
[] -> Left $ ParsingProblem "Couldn't parse version"
v:_ -> Right (v, s, concat $ runL parseDate s)
 
case ps of
Left e -> return e
Right (v, s, d) -> case runL parseModules s of
[] -> return NoDetectedModules
xs -> case (lefts xs, rights xs) of
([], []) -> return NoDetectedModules
([], _) -> return AllGood
(_, []) -> fetch (appendReport u v) >>= \case
Left _ -> return $ MissingDocs Nothing v d
Right s' -> return $ MissingDocs (parseFailReason s') v d
_ -> return MixedDocs
 
parseFailReason :: String -> Maybe String
parseFailReason [] = Nothing
parseFailReason s
| "installOutcome = " `isPrefixOf` s = Just $
takeWhile (\x -> x /= ',' && x /= ' ')
(drop (length "installOutcome = ") s)
| otherwise = parseFailReason $ drop 1 s
 
main :: IO ()
main = parseHackage Nothing >> putStrLn "Done"

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.