Skip to content

Instantly share code, notes, and snippets.

@symm symm/composer.json
Created Nov 8, 2012

Embed
What would you like to do?
Returns the episode titles of a certain American podcast series
{
"require":
{
"symfony/dom-crawler": "2.2.x-dev",
"symfony/css-selector": "2.2.x-dev"
}
}
<?php
require('vendor/autoload.php');
use Symfony\Component\DomCrawler\Crawler;
// Scrapes the radio archive: reads the per-year links from the navigation bar,
// downloads each year page, collects the episode titles and prints them
// ordered by episode key.
$baseUrl = "http://www.thisamericanlife.org";
$mainPage = downloadPage($baseUrl . '/radio-archives');
if ($mainPage === false) {
    // Without the index page there are no year links to follow.
    file_put_contents('php://stderr', 'Unable to download the archive index page.' . PHP_EOL);
    exit(1);
}
$crawler = new Crawler($mainPage);

// Read the contents of the navigation bar so we know which pages to download.
// The filtered crawler is iterated directly; the previous reduce() calls used
// an empty closure and therefore never removed any node.
$urlsToScrape = array();
foreach ($crawler->filter('#archive-date-nav li a') as $yearNode) {
    $urlsToScrape[] = $baseUrl . $yearNode->getAttribute('href');
}

// Go through each archived year and pull out the episode titles.
$episodes = array();
foreach ($urlsToScrape as $url) {
    $page = downloadPage($url);
    if ($page === false) {
        // A year page that could not be fetched is skipped; the rest are still listed.
        continue;
    }
    $crawler = new Crawler($page);

    // Read the episode title for the shows.
    foreach ($crawler->filter('.episode-archive > h3 a') as $showNode) {
        $title = $showNode->nodeValue;
        // The text before the first colon is used as the key
        // (presumably the episode number, e.g. "123: Some Title" -- confirm against the site).
        $bits = explode(':', $title, 2);
        // Trimmed so a purely numeric key is stored as an integer and sorts numerically.
        $epNum = trim($bits[0]);
        $episodes[$epNum] = $title;
    }
}

ksort($episodes);
foreach ($episodes as $episode) {
    echo $episode . PHP_EOL;
}
/**
 * Returns the contents of a URL, served from a simple on-disk cache when a
 * copy younger than one week exists.
 *
 * Cached copies are stored in the "cache" directory (relative to the current
 * working directory), one file per URL, named after the SHA-1 hash of the URL.
 *
 * @param string $url Location to fetch; passed as-is to file_get_contents().
 *
 * @return string|false The contents, or false when the download failed.
 */
function downloadPage($url) {
    // Simple caching.
    $cacheDir = 'cache';
    $cacheExpiry = 60 * 60 * 24 * 7; // One week
    $cacheFile = $cacheDir . '/' . sha1($url);

    if (!file_exists($cacheDir)) {
        mkdir($cacheDir);
    }

    // Serve the cached copy while it is still fresh.
    if (file_exists($cacheFile) && (time() - filemtime($cacheFile)) < $cacheExpiry) {
        return file_get_contents($cacheFile);
    }

    $page = file_get_contents($url);
    if ($page === false) {
        // Do not write a failed download to the cache: it would be stored as an
        // empty file and served as a valid hit until it expires.
        return false;
    }

    file_put_contents($cacheFile, $page);
    return $page;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.