Skip to content

Instantly share code, notes, and snippets.

@clrnd
Created April 1, 2016 19:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save clrnd/3f5221fe2b053265b0b935da8bc374f8 to your computer and use it in GitHub Desktop.
Save clrnd/3f5221fe2b053265b0b935da8bc374f8 to your computer and use it in GitHub Desktop.
Scalpel scraper for Dr. Who IMDB ratings.
-- stack --resolver lts-5.8 runghc --package "aeson scalpel bytestring text"
{-# LANGUAGE LambdaCase #-}
module Main where
import Control.Monad (forM)
import Text.HTML.Scalpel
import Data.Aeson
import Data.Text (Text, unpack, pack, strip)
import qualified Data.Text.IO as T
import qualified Data.ByteString.Lazy as B
data PreEpisode = PreEpisode
{ preTitle :: Text
, preNumber :: Int
, preUrl :: URL
, preDate :: Text
} deriving Show
data Episode = Episode
{ epTitle :: Text
, epSeason :: Int
, epNumber :: Int
, epScore :: Float
, epDate :: Text
} deriving Show
instance ToJSON Episode where
toJSON (Episode t ss n s d) = object [ pack "title" .= t
, pack "season" .= ss
, pack "number" .= n
, pack "score" .= s
, pack "date" .= show d ]
main :: IO ()
main = do
seasons <- forM [1..9] getSeason
B.writeFile "output.json" (encode seasons)
getSeason :: Int -> IO [Episode]
getSeason season = do
contents <- T.readFile ("episodes?season=" ++ show season)
case scrapeStringLike contents pre_eps of
Nothing -> error "lolz"
Just pre_eps' -> forM pre_eps' $ \pre_ep' -> do
putStrLn $ "scrapping season " ++ show season ++
", episode " ++ (show . preNumber) pre_ep'
scrapeURL ("http://www.imdb.com/" ++ preUrl pre_ep') score >>= \case
Nothing -> error $ "nou rating: " ++ show pre_ep'
Just score' -> return $ merge pre_ep' score' season
merge :: PreEpisode -> Float -> Int -> Episode
merge (PreEpisode t n _ d) s ss = Episode t ss n s d
score :: Scraper Text Float
score = fmap (read . unpack) . text $ "span" @: ["itemprop" @= "ratingValue"]
pre_eps :: Scraper Text [PreEpisode]
pre_eps = chroots ("div" @: ["itemprop" @= "episodes"]) pre_ep
pre_ep :: Scraper Text PreEpisode
pre_ep = PreEpisode <$> title <*> number <*> url <*> date
where
title :: Scraper Text Text
title = attr "title" $ "a" @: ["itemprop" @= "name"]
url :: Scraper Text String
url = fmap unpack . attr "href" $ "a" @: ["itemprop" @= "name"]
number :: Scraper Text Int
number = fmap (read . unpack) . attr "content" $ "meta" @: ["itemprop" @= "episodeNumber"]
date :: Scraper Text Text
date = fmap strip . text $ "div" @: ["class" @= "airdate"]
@clrnd
Copy link
Author

clrnd commented Apr 1, 2016

The initial season files where dowloaded using:

for x in {1..10}; do wget "http://www.imdb.com/title/tt0436992/episodes?season=$x"; done

Blog post: http://clrnd.com.ar/posts/2016-03-15-dr-who-imdb-ratings.html

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment