Skip to content

Instantly share code, notes, and snippets.

@Fuuzetsu
Created January 6, 2014 01:05
Show Gist options
  • Save Fuuzetsu/8276445 to your computer and use it in GitHub Desktop.
Save Fuuzetsu/8276445 to your computer and use it in GitHub Desktop.
Scrapes most-recent versions of all Hackage packages for the status of their documentation. Note that if you want to use this, you'll have to change the file path to save to at the top of the program.
{-# LANGUAGE LambdaCase #-}
module Main where
import Control.Monad ((>=>))
import Data.Either (rights, lefts)
import Data.Tree.NTree.TypeDefs (NTree)
import Data.List (isPrefixOf, isInfixOf)
import Network.HTTP (simpleHTTP, getRequest, rspBody)
import Network.Stream (ConnError)
import Text.XML.HXT.Core hiding (mkName)
resultFile :: String
resultFile = "/home/shana/hackageresult"
parsePackages
:: (ArrowXml cat, ArrowChoice cat) => cat XmlTree [Char]
parsePackages = hasName "a" >>> getAttrValue "href"
>>> arr (\x -> if "/package/" `isPrefixOf` x
then Left $ prependHackage x
else Right ())
>>> returnA ||| zeroArrow
parseVersion :: ArrowXml a => a XmlTree String
parseVersion = hasName "tr" //> hasName "strong" /> getText
parseDate :: (ArrowXml cat, ArrowChoice cat) => cat XmlTree String
parseDate = hasName "tr" //> getText
>>> arr (\x -> if isDate x
then Left x
else Right ())
>>> returnA ||| zeroArrow
-- try to guess if something's a date
isDate :: String -> Bool
isDate s = length (words s) == 6 && ":" `isInfixOf` (last $ take 4 (words s))
parseModules :: ArrowXml a => a XmlTree (Either String String)
parseModules = hasAttrValue "class" (== "module")
/> ifA isText (getText >>^ Left)
(hasName "a" >>> getAttrValue "href" >>^ Right)
appendReport :: String -> String -> String
appendReport u v = (u ++ "-" ++ v ++ "/reports/")
prependHackage :: String -> String
prependHackage = ("http://hackage.haskell.org" ++)
packageUrl :: String
packageUrl = "http://hackage.haskell.org/packages/"
fetch :: String -> IO (Either ConnError String)
fetch = simpleHTTP . getRequest >=> return . fmap rspBody
parseHackage :: Maybe [String] -> IO [(String, PackageStatus)]
parseHackage urls = case urls of
Nothing -> fetch packageUrl >>= \case
Left _ -> return []
Right pp -> case runL parsePackages pp of
[] -> return []
xs -> onPackage xs
Just xs -> onPackage xs
where
onPackage xs = do
let n = length xs
flip mapM (zip xs [1 .. ]) $ \(p, cn) -> do
putStrLn $ "[" ++ show cn ++ "/" ++ show n ++ "] " ++ "Processing " ++ p
r <- processPackage p
appendFile resultFile $ show (p, r) ++ "\n"
return (p, r)
data PackageStatus = AllGood | NoDetectedModules
| MissingDocs (Maybe String) String String
| MixedDocs
| NetworkProblem String
| ParsingProblem String
deriving (Eq, Show)
runL :: LA (NTree XNode) b -> String -> [b]
runL p = runLA (xreadDoc //> p)
processPackage :: String -> IO PackageStatus
processPackage u = do
ps <- fetch u >>= \case
Left e -> return . Left . NetworkProblem $ show e
Right s -> return $ case runL parseVersion s of
[] -> Left $ ParsingProblem "Couldn't parse version"
v:_ -> Right (v, s, concat $ runL parseDate s)
case ps of
Left e -> return e
Right (v, s, d) -> case runL parseModules s of
[] -> return NoDetectedModules
xs -> case (lefts xs, rights xs) of
([], []) -> return NoDetectedModules
([], _) -> return AllGood
(_, []) -> fetch (appendReport u v) >>= \case
Left _ -> return $ MissingDocs Nothing v d
Right s' -> return $ MissingDocs (parseFailReason s') v d
_ -> return MixedDocs
parseFailReason :: String -> Maybe String
parseFailReason [] = Nothing
parseFailReason s
| "installOutcome = " `isPrefixOf` s = Just $
takeWhile (\x -> x /= ',' && x /= ' ')
(drop (length "installOutcome = ") s)
| otherwise = parseFailReason $ drop 1 s
main :: IO ()
main = parseHackage Nothing >> putStrLn "Done"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment