Skip to content

Instantly share code, notes, and snippets.

@picasso250
Created May 14, 2013 03:16
Show Gist options
  • Save picasso250/5573413 to your computer and use it in GitHub Desktop.
Save picasso250/5573413 to your computer and use it in GitHub Desktop.
<?php
// Report every error category and print errors to the output while crawling.
error_reporting(E_ALL | E_STRICT);
ini_set('display_errors', 1);

// Crawling a whole book takes a while: raise the execution time limit to 1000 seconds.
set_time_limit(1000);

// Crawl configuration: the URL prefix pages must live under, the page the
// crawl starts from (located directly under that prefix), and the local
// directory the downloaded files are written to.
$root_url = 'http://mitpress.mit.edu/sicp/full-text/book/';
$start_url = $root_url.'book.html';
$save_root = 'sicp';

download_all_page($root_url, $start_url, $save_root);
/**
 * Crawl every page reachable from $start_url and mirror it under $save_root.
 *
 * Keeps a work queue of URLs still to fetch and hands them one at a time to
 * download_page_iter(), which saves the page and appends newly found links to
 * the queue. Prints the number of visited pages when the queue is empty.
 *
 * @param string $root_url  URL prefix passed through to download_page_iter().
 * @param string $start_url First URL to download.
 * @param string $save_root Local directory to write into; created (with any
 *                          missing parents) when absent. An empty string
 *                          means the current directory.
 * @return void
 */
function download_all_page($root_url, $start_url, $save_root)
{
    $visited = array();
    $to_visit = array($start_url => 1);

    // An empty directory name used to trigger an invalid string offset below
    // and resolve to "/"; treat it as the current directory instead.
    if ((string) $save_root === '') {
        $save_root = '.';
    }

    // Create the target directory recursively and fail loudly if that is not
    // possible, instead of failing later on the first file write.
    if (!is_dir($save_root) && !mkdir($save_root, 0777, true) && !is_dir($save_root)) {
        die("fail to create directory $save_root");
    }

    // Normalise to exactly one trailing slash so file names can be appended.
    $_save_root = substr($save_root, -1) === '/' ? $save_root : $save_root.'/';

    while ($to_visit) {
        // Always take the first queued URL rather than relying on wherever
        // the array's internal pointer happens to be.
        reset($to_visit);
        $url = key($to_visit);

        download_page_iter($root_url, $url, $_save_root, $visited, $to_visit);

        // Guarantee progress: if the iteration step did not dequeue the URL
        // itself, drop it here so the loop cannot spin on the same entry.
        unset($to_visit[$url]);
    }

    echo count($visited), ' page download';
}
/**
 * Download a single page, store it together with its images, and queue the
 * links it contains.
 *
 * The local file name of a page is the part of its URL that follows
 * $root_url. Images are fetched from $root_url.$src and stored under the same
 * relative name; each image URL is downloaded at most once per process (a
 * static cache remembers them). The script is terminated with die() when a
 * file cannot be written.
 *
 * @param string $root_url  URL prefix; only links starting with it are followed.
 * @param string $url       Page to download.
 * @param string $save_root Target directory, expected to end with "/".
 * @param array  $visited   In/out: map of URL => 1 for processed pages. $url is
 *                          added even when it has no file name relative to
 *                          $root_url and nothing is written for it.
 * @param array  $to_visit  In/out: map of URL => 1 for pages still to fetch.
 *                          $url is removed, newly discovered pages are added.
 * @return void
 */
function download_page_iter($root_url, $url, $save_root, &$visited, &$to_visit)
{
    // Dequeue and mark the URL up front so the caller's `while ($to_visit)`
    // loop always makes progress, including when no file is written below.
    $visited[$url] = 1;
    unset($to_visit[$url]);

    $html = fetch_content($url);

    // substr() gives '' (or false on older PHP) when $url is exactly the
    // root; compare explicitly so a file literally named "0" is still saved.
    $fname = substr($url, strlen($root_url));
    if ($fname !== false && $fname !== '') {
        // file_put_contents() returns the byte count, which is 0 for an empty
        // document; only a strict false signals a failed write.
        if (file_put_contents($save_root.$fname, $html) === false) {
            die("fail to write $fname");
        }
    }

    static $images = array();
    foreach (get_all_images($html, $url) as $src) {
        $image_src = $root_url.$src;
        if (isset($images[$image_src])) {
            continue;
        }
        $content = fetch_content($image_src);
        if (file_put_contents($save_root.$src, $content) === false) {
            die('fail to save image: '.$src);
        }
        $images[$image_src] = 1;
    }

    foreach (get_all_links($html, $url) as $href) {
        // Strip the fragment only when there is one. strpos() returns false
        // for a link without '#'; using that as a substr() length produced an
        // empty string and silently discarded every such link.
        $hash_pos = strpos($href, '#');
        $page_url = $hash_pos === false ? $href : substr($href, 0, $hash_pos);

        // The file name is derived by cutting $root_url off the front, so the
        // URL has to start with it, not merely contain it somewhere.
        if (strpos($page_url, $root_url) !== 0) {
            continue;
        }

        // Skip targets that cannot be stored as a file below $save_root: the
        // root itself, directory URLs, names carrying their own scheme (an
        // absolute or mailto: href that was joined onto the base URL), and
        // paths that climb out of the directory via "..".
        $relative = (string) substr($page_url, strlen($root_url));
        if ($relative === ''
            || substr($relative, -1) === '/'
            || preg_match('%^[a-z][a-z0-9+.\-]*:%i', $relative)
            || preg_match('%(^|/)\.\.(/|$)%', $relative)
        ) {
            continue;
        }

        if (!isset($visited[$page_url])) {
            $to_visit[$page_url] = 1;
        }
    }
}
/**
 * Fetch a URL with curl and return the response body.
 *
 * Terminates the script with die() when curl reports an error for the
 * transfer.
 *
 * @param string $url URL to request.
 * @return mixed Whatever curl_exec() returns with CURLOPT_RETURNTRANSFER
 *               enabled (the body as a string on success).
 */
function fetch_content($url)
{
    $handle = curl_init();
    curl_setopt($handle, CURLOPT_URL, $url);
    // Have curl_exec() hand the body back instead of printing it.
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);

    $body = curl_exec($handle);

    // A non-zero curl error number means the transfer failed.
    if (curl_errno($handle) !== 0) {
        die('curl error: '.curl_error($handle));
    }

    curl_close($handle);

    return $body;
}
/**
 * Extract the href of every <a> tag in $html and resolve it against the URL
 * of the page it was found on.
 *
 * Resolution rules:
 *  - href with its own scheme ("http://...", "mailto:...") is returned as is;
 *  - "//host/path" takes the scheme of $page_url;
 *  - "/path" is resolved against scheme://host[:port] of $page_url;
 *  - "#fragment" refers to $page_url itself;
 *  - anything else is appended to the directory of $page_url.
 * "." and ".." segments are not collapsed.
 *
 * @param string $html     HTML source to scan. Only double-quoted href
 *                         attributes are recognised.
 * @param string $page_url URL the HTML was fetched from.
 * @return array List of resolved URLs in document order; fragments and
 *               duplicates are kept. Empty when no link is found.
 */
function get_all_links($html, $page_url)
{
    // Allow other attributes before and after href, and stop the captured
    // value at the closing quote so trailing attributes never leak into it.
    if (!preg_match_all('%<a\s(?:[^>]*?\s)?href="([^"]+)"[^>]*>%i', $html, $matches)) {
        return array();
    }

    $u = parse_url($page_url);
    if (!is_array($u)) {
        // parse_url() returns false for seriously malformed URLs.
        $u = array();
    }

    $scheme = isset($u['scheme']) ? $u['scheme'] : 'http';
    $origin = $scheme.'://'.(isset($u['host']) ? $u['host'] : '');
    if (isset($u['port'])) {
        $origin .= ':'.$u['port'];
    }

    // Directory of the page without a trailing slash. Cutting at the last
    // "/" (rather than dirname()) yields '' for a page directly under the
    // root, which avoids "http://host//file" and platform-specific separators.
    $path = isset($u['path']) ? $u['path'] : '/';
    $last_slash = strrpos($path, '/');
    $dir_url = $origin.($last_slash === false ? '' : substr($path, 0, $last_slash));

    // Target for fragment-only links: the page itself minus its own fragment.
    $hash_pos = strpos($page_url, '#');
    $page_without_fragment = $hash_pos === false ? $page_url : substr($page_url, 0, $hash_pos);

    $ret = array();
    foreach ($matches[1] as $href) {
        if (preg_match('%^[a-z][a-z0-9+.\-]*:%i', $href)) {
            // Already absolute (or a non-HTTP scheme such as mailto:).
            $ret[] = $href;
        } elseif (strpos($href, '//') === 0) {
            $ret[] = $scheme.':'.$href;
        } elseif ($href[0] === '/') {
            $ret[] = $origin.$href;
        } elseif ($href[0] === '#') {
            $ret[] = $page_without_fragment.$href;
        } else {
            $ret[] = $dir_url.'/'.$href;
        }
    }

    return $ret;
}
/**
 * Collect the src attribute of every <img> tag found in $html.
 *
 * The values are returned exactly as written in the markup; they are not
 * resolved against the page URL.
 *
 * @param string $html     HTML source to scan.
 * @param string $page_url URL of the page (currently unused).
 * @return array List of src values in document order; empty when there is no
 *               match.
 */
function get_all_images($html, $page_url)
{
    $found = preg_match_all('%<img\s.*?\bsrc="(.+?)".*?>%', $html, $captures);
    if (!$found) {
        return array();
    }

    // Group 1 holds the src values, one entry per matched tag.
    return array_values($captures[1]);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment