Skip to content

Instantly share code, notes, and snippets.

@inkless
Created June 9, 2013 09:44
Show Gist options
  • Save inkless/5742978 to your computer and use it in GitHub Desktop.
Save inkless/5742978 to your computer and use it in GitHub Desktop.
抓取静态网站的全部静态资源和内容
<?php
ini_set('display_errors', 'On');

// HTTP_HOST is only populated when a web server handles the request; on the
// command line the options are taken positionally from $argv instead.
$is_web = !empty($_SERVER['HTTP_HOST']);
$cli_args = isset($argv) ? $argv : array();

// Reads one option from the request (web) or from the argument list (CLI).
// Returns null when the option was not supplied, so optional arguments can be
// omitted without triggering undefined-index warnings.
$read_option = function ($request_key, $argv_index) use ($is_web, $cli_args) {
    if ($is_web) {
        return isset($_REQUEST[$request_key]) ? $_REQUEST[$request_key] : null;
    }
    return isset($cli_args[$argv_index]) ? $cli_args[$argv_index] : null;
};

$theme_id = $read_option('theme_id', 1);
if (!$theme_id) die("No Theme ID specified!");
// The theme id becomes part of a local directory name, so refuse anything
// that is not a plain string or that tries to climb out of themes/ via "..".
if (!is_string($theme_id) || preg_match('#(?:^|[/\\\\])\.\.(?:[/\\\\]|$)#', $theme_id)) {
    die("Invalid Theme ID specified!");
}

$target_url = $read_option('target', 2);
$target_url = $target_url ?: "http://wbpreview.com/previews/";
// TARGET_URL is concatenated directly with THEME_ID, so it must end in "/".
if (substr($target_url, -1) !== "/") $target_url .= "/";

$index_page = $read_option('index_page', 3);
$index_page = $index_page ? $index_page : "index";

$short_html = $read_option('short_html', 4);

define("HTML_TYPE", $short_html ? ".htm" : ".html");
define("THEME_ID", $theme_id);
define("ROOT", __DIR__."/themes/".THEME_ID);
define("TARGET_URL", $target_url);
define('LINE', $is_web ? "<br />" : "\n");

// In web mode the first message is padded to 4096 bytes before flushing.
// NOTE(review): presumably so browsers start rendering progress — confirm.
if ($is_web) echo str_pad('Downloading... ',4096).LINE;
else echo "Downloading... ".LINE;
// ob_flush() raises a notice when no output buffer is active, and must run
// before flush() so the buffered bytes actually reach the client.
if (ob_get_level() > 0) ob_flush();
flush();

// Create the output directory recursively: themes/ itself may not exist yet.
if (!is_dir(ROOT) && !mkdir(ROOT, 0777, true)) {
    die("Cannot create directory ".ROOT."!");
}
/**
 * Global crawl state shared by the download functions.
 *
 * Both lists hold page names without the HTML extension.
 */
class PAGES {
    /** Pages that have already been processed. */
    public static $done = array();

    /** Pages discovered but not processed yet (used as a stack). */
    public static $all = array();
}
// Seed the pending-page stack with the entry page, then start the crawl.
PAGES::$all[] = $index_page;
download_by_page();
/**
 * Crawl every page queued in PAGES::$all until the queue is empty.
 *
 * For each page the HTML is fetched from TARGET_URL/THEME_ID, stored under
 * ROOT (unless a copy already exists), scanned for further pages to queue,
 * and its stylesheets, scripts, images and inline url(...) references are
 * handed to download_files() / investigate_css_files().
 *
 * Processed pages are appended to PAGES::$done; a page whose download fails
 * is reported and also marked done so it is not queued again.
 *
 * @return void
 */
function download_by_page() {
    // Drain the queue in a loop rather than recursing once per page, so the
    // call stack stays flat no matter how many pages the site has.
    while (count(PAGES::$all)) {
        $cur_page = array_pop(PAGES::$all);
        echo "{$cur_page} 页面下载开始...".LINE;
        $page_url = TARGET_URL.THEME_ID."/{$cur_page}".HTML_TYPE;
        $html = get_http_data($page_url);
        if (!is_string($html)) {
            // get_http_data() returns false on failure; writing that to disk
            // would leave an empty file that later looks like a valid copy.
            echo "{$cur_page} 页面下载失败!".LINE;
            array_push(PAGES::$done, $cur_page);
            continue;
        }

        // Directory of the current page relative to ROOT ("" for top level).
        $file_name = $cur_page.HTML_TYPE;
        if (($last_slash = strrpos($file_name, "/")) !== false) {
            $ex_path = "/".substr($file_name, 0, $last_slash);
            $path = ROOT.$ex_path;
            if (!is_dir($path)) mkdir($path, 0777, true);
        } else {
            $ex_path = "";
        }
        if (!is_file(ROOT."/".$file_name)) file_put_contents(ROOT."/".$file_name, $html);

        /**
         * Anchor tags: collect links to further pages.
         * [^>]* keeps each match inside a single tag, so several links on the
         * same line are all found.
         */
        preg_match_all('/<a\s[^>]*?href\s*=\s*["\']([^"\']+)["\']/i', $html, $matches);
        if (!empty($matches[1])) {
            foreach ($matches[1] as $l) {
                $page = resolve_page_link($l, $ex_path);
                if ($page === false) continue;
                // Queue only pages that are neither finished, pending, nor current.
                if ($page !== $cur_page
                    && !in_array($page, PAGES::$done, true)
                    && !in_array($page, PAGES::$all, true)) {
                    array_push(PAGES::$all, $page);
                }
            }
        }

        /**
         * <link> targets: download them and scan them for nested url(...).
         */
        preg_match_all('/<link [^>]*href[\s]*=[\s]*["\']([^"\']+)["\']/i', $html, $matches);
        if (!empty($matches[1])) {
            $css_files = $matches[1];
            download_files($css_files, $ex_path);
            investigate_css_files($css_files, $ex_path);
        }
        /**
         * Scripts, images and inline url(...) references.
         */
        preg_match_all('/<script [^>]*src[\s]*=[\s]*["\']([^"\']+)["\']/i', $html, $matches);
        if (!empty($matches[1])) {
            download_files($matches[1], $ex_path);
        }
        preg_match_all('/<img [^>]*src[\s]*=[\s]*["\']([^"\']+)["\']/i', $html, $matches);
        if (!empty($matches[1])) {
            download_files($matches[1], $ex_path);
        }
        preg_match_all('/url\(["\']?([^\)"\']+)[\'"]?\)/i', $html, $matches);
        if (!empty($matches[1])) {
            download_files($matches[1], $ex_path);
        }

        echo "{$cur_page} 页面下载完成!".LINE;
        array_push(PAGES::$done, $cur_page);
    }
    echo "全部下载完毕!".LINE;
}

/**
 * Turn an href found on a page into a normalised page name.
 *
 * @param string $link    Raw href value.
 * @param string $ex_path Directory of the referring page relative to the
 *                        theme root ("" or "/sub/dir").
 * @return string|false   Page name without extension and without leading
 *                        slash (e.g. "sub/about"), or false when the link is
 *                        not a crawlable page of this theme.
 */
function resolve_page_link($link, $ex_path) {
    $link = trim($link);
    if (($hash_pos = strpos($link, "#")) !== false) {
        $link = substr($link, 0, $hash_pos);
    }
    if (($q_pos = strpos($link, "?")) !== false) {
        $link = substr($link, 0, $q_pos);
    }
    // Links with a scheme (http:, mailto:, javascript:, ...) or a
    // protocol-relative "//host" prefix point outside the theme.
    if (preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $link)) return false;

    // Only links ending in the configured HTML extension are pages.
    $ext_len = strlen(HTML_TYPE);
    if (strlen($link) <= $ext_len || substr($link, -$ext_len) !== HTML_TYPE) return false;
    $link = substr($link, 0, -$ext_len);
    if (substr($link, -1) === "/") return false;

    // Root-relative links are taken relative to the theme root; everything
    // else is relative to the directory of the referring page.
    $combined = ($link[0] === "/") ? $link : $ex_path."/".$link;

    // Collapse "." and ".." so the same page always gets the same name;
    // otherwise "sub/../index" and "index" would be crawled as distinct pages.
    $segments = array();
    foreach (explode("/", $combined) as $segment) {
        if ($segment === "" || $segment === ".") continue;
        if ($segment === "..") {
            // Climbing above the theme root leaves the crawl scope.
            if (!$segments) return false;
            array_pop($segments);
            continue;
        }
        $segments[] = $segment;
    }
    return $segments ? implode("/", $segments) : false;
}
/**
 * Download theme-relative resources into the local mirror under ROOT.
 *
 * Each reference is resolved against TARGET_URL/THEME_ID plus $base_path and
 * stored at the same relative location below ROOT, with any query string or
 * fragment removed from the local file name. Files that already exist locally
 * are reported as cached and not fetched again. One status line is printed
 * per processed reference.
 *
 * @param array  $files     Resource references as found in HTML/CSS.
 * @param string $base_path Directory of the referring document relative to
 *                          the theme root ("" or "/sub/dir").
 * @return void
 */
function download_files($files, $base_path="") {
    foreach ($files as $v) {
        $v = trim($v);
        // Only theme-relative paths are mirrored. Anything with a scheme
        // (http:, https:, data:, ...) or a protocol-relative "//host" prefix
        // is skipped; treating e.g. "data:font/woff;base64,..." as a path
        // would create junk directories named after the payload.
        if ($v === "" || preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $v)) continue;

        // Local file name = reference without fragment and query string.
        $local = $v;
        if (($hash_pos = strpos($local, "#")) !== false) {
            $local = substr($local, 0, $hash_pos);
        }
        if (($q_pos = strpos($local, "?")) !== false) {
            $local = substr($local, 0, $q_pos);
        }
        // Pure fragments such as url(#gradient) and directory references
        // have no file to store.
        if ($local === "" || substr($local, -1) === "/") continue;

        $url = TARGET_URL.THEME_ID.$base_path."/".$v;
        $to = ROOT.$base_path."/".$local;

        if (is_file($to)) {
            echo $url." 已存在,使用缓存文件!";
        } else {
            $text = get_http_data($url);
            $dir = dirname($to);
            if (!is_string($text)) {
                // Do not write anything on failure: an empty file would be
                // mistaken for a cached copy on every later run.
                echo $url." 下载失败!";
            } elseif (is_file($dir) || (!is_dir($dir) && !mkdir($dir, 0777, true))) {
                // The target directory is blocked by a file or cannot be created.
                echo $url." 下载失败!";
            } else {
                $r = file_put_contents($to, $text);
                if ($r === false) echo $url." 下载失败!";
                else echo $url." 下载成功,大小:".$r;
            }
        }
        echo LINE;
        // ob_flush() raises a notice without an active buffer and has to run
        // before flush() for the output to reach the client.
        if (ob_get_level() > 0) ob_flush();
        flush();
    }
}
/**
 * Scan already-downloaded files for url(...) references and mirror them.
 *
 * Every local file named in $files is read from ROOT, its url(...) targets
 * are passed to download_files(), and referenced stylesheets (targets ending
 * in ".css") are scanned in turn. Each local file is scanned at most once per
 * run, so stylesheets that import each other cannot cause endless recursion.
 *
 * @param array|null $files   References as found in HTML/CSS; may be empty.
 * @param string     $ex_path Directory of the referring document relative to
 *                            the theme root ("" or "/sub/dir").
 * @return void
 */
function investigate_css_files($files, $ex_path) {
    // Real paths of files that have already been scanned during this run.
    static $seen = array();

    if (!$files) return;
    foreach ($files as $v) {
        $v = trim($v);
        // Remote (http:, https:, //host) and inline (data:) references have
        // no local copy to read.
        if ($v === "" || preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $v)) continue;

        // Local file name = reference without fragment and query string.
        $local = $v;
        if (($hash_pos = strpos($local, "#")) !== false) {
            $local = substr($local, 0, $hash_pos);
        }
        if (($q_pos = strpos($local, "?")) !== false) {
            $local = substr($local, 0, $q_pos);
        }
        if ($local === "") continue;

        $from = ROOT.$ex_path."/".$local;
        // The download may have failed, in which case there is nothing to scan.
        if (!is_file($from)) continue;

        // realpath() collapses "../" detours so the same file is recognised
        // regardless of how it was referenced.
        $key = realpath($from);
        if ($key === false) $key = $from;
        if (isset($seen[$key])) continue;
        $seen[$key] = true;

        // References inside this file are relative to its own directory.
        if (($last_slash = strrpos($local, "/")) !== false)
            $path = $ex_path."/".substr($local, 0, $last_slash);
        else
            $path = $ex_path;

        $css = file_get_contents($from);
        if ($css === false) continue;
        if (!preg_match_all('/url\(["\']?([^\)"\']+)[\'"]?\)/i', $css, $matches) || empty($matches[1])) continue;

        $bg_files = $matches[1];
        download_files($bg_files, $path);

        // Only stylesheets can contain further url(...) references; parsing
        // downloaded images or fonts as CSS would only yield garbage matches.
        $nested = array();
        foreach ($bg_files as $ref) {
            if (preg_match('/\.css(?:[?#].*)?$/i', trim($ref))) $nested[] = $ref;
        }
        investigate_css_files($nested, $path);
    }
}
/**
 * Fetch a URL with cURL and return the response body.
 *
 * @param string $requestURL URL to request.
 * @param array  $params     Optional settings. 'post' (array or string), when
 *                           non-empty, turns the request into a POST with that
 *                           payload. 'return' and 'cookie' are always
 *                           overwritten internally.
 * @param string $cookiePath File used to read and persist cookies; '' keeps
 *                           cookies in memory for this request only.
 * @param int    $timeout    Maximum duration of the transfer in seconds.
 * @return string|false      Response body, or false when the curl extension
 *                           is missing or the transfer fails.
 */
function get_http_data($requestURL, $params = array(), $cookiePath = '', $timeout = 15) {
    if (!extension_loaded('curl')) {
        return false;
    }
    $params['return'] = TRUE;
    $params['cookie'] = $cookiePath;

    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    // parse_url() yields null/false when the URL has no (valid) scheme.
    $scheme = parse_url($requestURL, PHP_URL_SCHEME);
    if (is_string($scheme) && strtolower($scheme) === 'https') {
        // 2 is the only host-check level current libcurl accepts besides 0.
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
        // NOTE(review): certificate verification is switched off, as in the
        // original code, so hosts with broken certificates can still be
        // mirrored. This permits man-in-the-middle tampering — enable it if
        // the downloaded content has to be trusted.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
    }
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_URL, $requestURL);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, $params['return']);
    // An empty COOKIEFILE still enables cookie handling for this handle; the
    // jar is only configured when there is a file to write to.
    curl_setopt($ch, CURLOPT_COOKIEFILE, $params['cookie']);
    if ($params['cookie'] !== '') {
        curl_setopt($ch, CURLOPT_COOKIEJAR, $params['cookie']);
    }

    if (!empty($params['post'])) {
        curl_setopt($ch, CURLOPT_POST, TRUE);
        $post = is_array($params['post']) ? http_build_query($params['post']) : $params['post'];
        curl_setopt($ch, CURLOPT_POSTFIELDS, $post);
    }

    $data = curl_exec($ch);
    // curl_close() has no effect since PHP 8.0, where handles are objects
    // that are released automatically; it is only needed on older versions.
    if (PHP_VERSION_ID < 80000) {
        curl_close($ch);
    }

    // curl_exec() signals failure with false (never null).
    return is_string($data) ? $data : false;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment