Created
June 9, 2013 09:44
-
-
Save inkless/5742978 to your computer and use it in GitHub Desktop.
抓取静态网站的全部静态资源和内容
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Static-site grabber: downloads the pages of one "theme" from a remote host
 * together with the stylesheets, scripts and images they reference, and stores
 * them under __DIR__/themes/<theme_id>.
 *
 * Inputs are read from the request when served over HTTP, or from positional
 * command-line arguments otherwise:
 *   theme_id   / argv[1]  required; appended to the target URL and used as the local directory name
 *   target     / argv[2]  base URL, defaults to http://wbpreview.com/previews/
 *   index_page / argv[3]  first page to fetch (without suffix), defaults to "index"
 *   short_html / argv[4]  truthy => pages end in ".htm" instead of ".html"
 */
ini_set('display_errors', 'On');

// HTTP_HOST is absent on the CLI; test it without raising an undefined-index warning.
$is_web = !empty($_SERVER['HTTP_HOST']);
$cli_args = isset($argv) ? $argv : array();

// Returns the named request parameter (web) or positional argument (CLI), or null when missing.
$read_input = function ($name, $position) use ($is_web, $cli_args) {
    if ($is_web) {
        return isset($_REQUEST[$name]) ? $_REQUEST[$name] : null;
    }
    return isset($cli_args[$position]) ? $cli_args[$position] : null;
};

$theme_id = $read_input('theme_id', 1);
if (!is_string($theme_id) || $theme_id === '') die("No Theme ID specified!");
// The theme id becomes part of a local path, so refuse ".." segments that would escape themes/.
if (preg_match('#(^|[/\\\\])\.\.([/\\\\]|$)#', $theme_id)) die("Invalid Theme ID!");

$target_url = $read_input('target', 2);
if (!is_string($target_url) || $target_url === '') {
    $target_url = "http://wbpreview.com/previews/";
}
// Page URLs are built as TARGET_URL.THEME_ID, so the base must end with a slash.
$target_url = rtrim($target_url, "/")."/";

$index_page = $read_input('index_page', 3);
if (!is_string($index_page) || $index_page === '') {
    $index_page = "index";
}

$short_html = $read_input('short_html', 4);

define("HTML_TYPE", $short_html ? ".htm" : ".html");
define("THEME_ID", $theme_id);
define("ROOT", __DIR__."/themes/".THEME_ID);
define("TARGET_URL", $target_url);
// Line terminator for progress messages: HTML break in a browser, newline on the CLI.
define('LINE', $is_web ? "<br />" : "\n");

// In a browser the first chunk is padded to 4096 bytes, presumably so it is displayed immediately.
if ($is_web) echo str_pad('Downloading... ',4096).LINE;
else echo "Downloading... ".LINE;
// Flush PHP's own buffer first (only if one is active), then the SAPI buffer.
if (ob_get_level() > 0) ob_flush();
flush();

// Create the output directory, including a missing "themes" parent.
if (!is_dir(ROOT) && !mkdir(ROOT, 0777, true)) die("Cannot create directory ".ROOT);
/**
 * Crawl state shared by the download functions.
 */
class PAGES {
    /** Pages that have already been processed (names without the HTML suffix). */
    public static $done = array();
    /** Pages discovered but not yet downloaded; used as a stack. */
    public static $all = array();
}
// Seed the queue with the entry page, then start the crawl.
PAGES::$all[] = $index_page;
download_by_page();
/**
 * Resolves a page link found on a page against that page's directory.
 *
 * @param string $link    Link target without fragment, query or HTML suffix.
 * @param string $ex_path Directory of the current page relative to the theme root ("" or "/sub/dir").
 * @return string Normalized page name relative to the theme root, without leading slash,
 *                "." or ".." segments; "" when nothing is left.
 */
function _resolve_page_link($link, $ex_path) {
    // A leading slash means "relative to the theme root", anything else is relative to the page.
    $base = ($link !== "" && $link[0] === "/") ? "" : trim($ex_path, "/");
    $segments = array();
    foreach (explode("/", $base."/".$link) as $seg) {
        if ($seg === "" || $seg === ".") continue;
        if ($seg === "..") {
            array_pop($segments);
            continue;
        }
        $segments[] = $seg;
    }
    return implode("/", $segments);
}

/**
 * Downloads every page queued in PAGES::$all.
 *
 * For each page: fetches TARGET_URL.THEME_ID/<page><HTML_TYPE>, stores it under ROOT
 * (unless the file already exists), queues same-site links that end in HTML_TYPE, and
 * downloads the assets referenced by <link>, <script>, <img> and url(...). Finished pages
 * are appended to PAGES::$done. Progress is echoed, terminated with LINE.
 *
 * @return void
 */
function download_by_page() {
    // Iterate instead of recursing once per page, so call depth does not grow with the site size.
    while (count(PAGES::$all)) {
        $cur_page = array_pop(PAGES::$all);
        echo "{$cur_page} 页面下载开始...".LINE;
        $page_url = TARGET_URL.THEME_ID."/{$cur_page}".HTML_TYPE;
        $html = get_http_data($page_url);
        if ($html === false) {
            // Do not store a failed fetch as an empty page; mark it done so it is not retried forever.
            echo "{$cur_page} 页面下载失败!".LINE;
            array_push(PAGES::$done, $cur_page);
            continue;
        }
        $file_name = $cur_page.HTML_TYPE;
        if (($last_slash = strrpos($file_name, "/")) !== false) {
            // Page lives in a subdirectory: remember it as the base for relative references.
            $ex_path = "/".substr($file_name, 0, $last_slash);
            $path = ROOT.$ex_path;
            if (!is_dir($path)) mkdir($path, 0777, true);
        } else {
            $ex_path = "";
        }
        if (!is_file(ROOT."/".$file_name)) file_put_contents(ROOT."/".$file_name, $html);
        /**
         * Anchor tags: collect further pages to crawl.
         * The pattern stops after the href value, so several links on one line are all captured.
         */
        preg_match_all('/<a\s[^>]*?href\s*=\s*["\']([^"\']+)["\']/i', $html, $matches);
        if ($matches && $matches[1]) {
            $site_prefix = TARGET_URL.THEME_ID."/";
            $suffix_len = strlen(HTML_TYPE);
            foreach ($matches[1] as $l) {
                $l = trim($l);
                if (($hash_pos = strpos($l, "#")) !== false) {
                    $l = substr($l, 0, $hash_pos);
                }
                if (($q_pos = strpos($l, "?")) !== false) {
                    $l = substr($l, 0, $q_pos);
                }
                if (strpos($l, $site_prefix) === 0) {
                    // Absolute link into the site being copied: make it theme-root relative.
                    $l = "/".substr($l, strlen($site_prefix));
                } elseif (strpos($l, "//") === 0 || preg_match('#^[a-z][a-z0-9+.\-]*:#i', $l)) {
                    // Other hosts and non-HTTP schemes (mailto:, javascript:, ...) are not part of the site.
                    continue;
                }
                // Only links with the page suffix and a non-empty name are pages.
                if (strlen($l) <= $suffix_len || substr($l, -$suffix_len) !== HTML_TYPE) continue;
                $l = _resolve_page_link(substr($l, 0, -$suffix_len), $ex_path);
                if ($l === "" || $l === $cur_page) continue;
                // Queue it unless it is already done or already waiting.
                if (!in_array($l, PAGES::$done, true) && !in_array($l, PAGES::$all, true))
                    array_push(PAGES::$all, $l);
            }
        }
        /**
         * <link> and <script> references
         */
        preg_match_all('/<link [^>]*href[\s]*=[\s]*["\']([^"\']+)["\']/i', $html, $matches);
        if ($matches && $matches[1]) {
            $css_files = $matches[1];
            download_files($css_files, $ex_path);
            // Stylesheets can pull in more assets through url(...).
            investigate_css_files($css_files, $ex_path);
        }
        preg_match_all('/<script [^>]*src[\s]*=[\s]*["\']([^"\']+)["\']/i', $html, $matches);
        if ($matches && $matches[1]) {
            download_files($matches[1], $ex_path);
        }
        preg_match_all('/<img [^>]*src[\s]*=[\s]*["\']([^"\']+)["\']/i', $html, $matches);
        if ($matches && $matches[1]) {
            download_files($matches[1], $ex_path);
        }
        // url(...) inside inline styles / <style> blocks of the page itself.
        preg_match_all('/url\(["\']?([^\)"\']+)[\'"]?\)/i', $html, $matches);
        if ($matches && $matches[1]) {
            download_files($matches[1], $ex_path);
        }
        echo "{$cur_page} 页面下载完成!".LINE;
        array_push(PAGES::$done, $cur_page);
    }
    echo "全部下载完毕!".LINE;
}
/**
 * Downloads site-relative asset references into the local copy.
 *
 * Each reference is fetched from TARGET_URL.THEME_ID.$base_path/<reference> and stored at
 * ROOT.$base_path/<reference without query and fragment>. Files that already exist locally
 * are reused. One progress line per reference is echoed, terminated with LINE.
 *
 * @param array  $files     Asset references as found in HTML/CSS.
 * @param string $base_path Directory of the referencing document relative to the theme root
 *                          ("" or "/sub/dir").
 * @return void
 */
function download_files($files, $base_path="") {
    foreach ($files as $v) {
        $v = trim($v);
        // Inline data URIs (images, fonts, ...) carry their own payload.
        if (strpos($v, "data:") === 0 || strpos($v, "data:image") !== false) continue;
        // Absolute and protocol-relative URLs are not relative to this site.
        if (strpos($v, "http://") === 0 || strpos($v, "https://") === 0 || strpos($v, "//") === 0) continue;

        // Strip fragment and query from the reference itself, never from the full local
        // path: ROOT may legitimately contain "#" or "?".
        $rel = $v;
        if (($hash_pos = strpos($rel, "#")) !== false) {
            $rel = substr($rel, 0, $hash_pos);
        }
        $url_rel = $rel; // the query string is still sent to the server
        if (($q_pos = strpos($rel, "?")) !== false) {
            $rel = substr($rel, 0, $q_pos);
        }
        // Nothing left (e.g. url(#gradient)) or a directory reference: no file to write.
        if ($rel === "" || substr($rel, -1) === "/") continue;

        // Make sure the target directory exists.
        if (($last_slash = strrpos($rel, "/")) !== false) {
            $path = ROOT.$base_path."/".substr($rel, 0, $last_slash);
            if (is_file($path)) continue;
            if (!is_dir($path) && !mkdir($path, 0777, true)) {
                echo $path." 目录创建失败!".LINE;
                continue;
            }
        }
        $url = TARGET_URL.THEME_ID.$base_path."/".$url_rel;
        $to = ROOT.$base_path."/".$rel;
        if (is_file($to)) {
            echo $url." 已存在,使用缓存文件!";
        } else {
            $text = get_http_data($url);
            // A failed request must not be stored as an empty file and reported as success.
            $r = ($text === false) ? false : file_put_contents($to, $text);
            if ($r === false) {
                echo $url." 下载失败!";
            } else {
                echo $url." 下载成功,大小:".$r;
            }
        }
        echo LINE;
        // Push progress to the client: PHP's buffer first (if any), then the SAPI buffer.
        if (ob_get_level() > 0) ob_flush();
        flush();
    }
}
/**
 * Scans already-downloaded stylesheets for further assets and downloads them.
 *
 * For every local ".css" file in $files, collects url(...) references and quoted
 * @import targets, downloads them relative to the stylesheet's directory and recurses
 * into them so imported stylesheets are scanned too. Each stylesheet is scanned at most
 * once per run, which also stops circular imports.
 *
 * @param array|null $files   References as found in the referencing document.
 * @param string     $ex_path Directory of the referencing document relative to the theme
 *                            root ("" or "/sub/dir").
 * @return void
 */
function investigate_css_files($files, $ex_path) {
    // Local files already scanned during this run, keyed by resolved path.
    static $seen = array();
    if (!$files) return;
    foreach ($files as $v) {
        $v = trim($v);
        if (strpos($v, "data:") === 0 || strpos($v, "data:image") !== false) continue;
        // Remote references are never downloaded, so there is no local file to read.
        if (strpos($v, "http://") === 0 || strpos($v, "https://") === 0 || strpos($v, "//") === 0) continue;

        // Local name = reference without fragment and query.
        $rel = $v;
        if (($hash_pos = strpos($rel, "#")) !== false) {
            $rel = substr($rel, 0, $hash_pos);
        }
        if (($q_pos = strpos($rel, "?")) !== false) {
            $rel = substr($rel, 0, $q_pos);
        }
        // Only stylesheets can contain url()/@import; images and fonts are binary data.
        if (strtolower(substr($rel, -4)) !== ".css") continue;

        $from = ROOT.$ex_path."/".$rel;
        if (!is_file($from)) continue; // download failed or was skipped

        $key = realpath($from);
        if ($key === false) $key = $from;
        if (isset($seen[$key])) continue;
        $seen[$key] = true;

        // References inside a stylesheet are relative to the stylesheet's own directory.
        if (($last_slash = strrpos($rel, "/")) !== false)
            $path = $ex_path."/".substr($rel, 0, $last_slash);
        else
            $path = $ex_path;

        $css = file_get_contents($from);
        if ($css === false) continue;

        preg_match_all('/url\(["\']?([^\)"\']+)[\'"]?\)/i', $css, $matches);
        $found = empty($matches[1]) ? array() : $matches[1];
        // @import "file.css"; has no url(...) wrapper and would otherwise be missed.
        preg_match_all('/@import\s+["\']([^"\']+)["\']/i', $css, $matches);
        if (!empty($matches[1])) {
            $found = array_merge($found, $matches[1]);
        }
        if ($found) {
            download_files($found, $path);
            investigate_css_files($found, $path);
        }
    }
}
/**
 * Fetches a URL with cURL and returns the response body.
 *
 * @param string $requestURL URL to request.
 * @param array  $params     Optional settings:
 *                           'post'        => array|string  send a POST with these fields
 *                           'verify_peer' => bool          verify the TLS certificate chain
 *                                                          (off by default, as before)
 * @param string $cookiePath File used to load and store cookies; '' keeps cookies in memory only.
 * @param int    $timeout    Maximum duration of the whole transfer in seconds.
 * @return string|false Response body, or false when the curl extension is missing or the
 *                      transfer failed.
 */
function get_http_data($requestURL, $params = array(), $cookiePath = '', $timeout = 15) {
    if (!extension_loaded('curl')) {
        return false;
    }
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    // Ask for the scheme only: parse_url() may return false/null for malformed URLs.
    $scheme = parse_url($requestURL, PHP_URL_SCHEME);
    if (is_string($scheme) && strtolower($scheme) === 'https') {
        // 2 is the only supported "check the host name" value; 1 is rejected by current libcurl.
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
        // NOTE(review): certificate-chain verification stays off by default to keep the
        // previous behaviour; pass 'verify_peer' => true to enable it.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, !empty($params['verify_peer']));
    }

    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ch, CURLOPT_URL, $requestURL);
    // Return the body from curl_exec() instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // An empty COOKIEFILE still switches on cURL's in-memory cookie handling.
    curl_setopt($ch, CURLOPT_COOKIEFILE, $cookiePath);
    if ($cookiePath !== '') {
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookiePath);
    }

    if (!empty($params['post'])) {
        curl_setopt($ch, CURLOPT_POST, true);
        $post = is_array($params['post']) ? http_build_query($params['post']) : $params['post'];
        curl_setopt($ch, CURLOPT_POSTFIELDS, $post);
    }

    $data = curl_exec($ch);
    curl_close($ch);

    // curl_exec() signals failure with false (never null).
    if ($data === false) {
        return false;
    }
    return $data;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment