Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Save a webpage (and its linked assets) into a single HTML file.
#! /usr/bin/env php
<?php
/***
* PageSnap
*
* Script for archiving an HTML page (and its associated media assets) into a single file.
*
* Requires:
*
* - PhpQuery: http://code.google.com/p/phpquery
* - Parallel Curl: http://github.com/petewarden/parallelcurl
*/
require 'parallelcurl.php';
require 'phpQuery.php';
/**
 * Archives an HTML page into one self-contained file.
 *
 * The page is loaded into a phpQuery document, every asset referenced by the
 * configured selectors is downloaded through ParallelCurl, and each reference
 * is replaced in the DOM by a base64 "data:" URI before the document is saved.
 */
class PageSnap {
    /** URL of the page being archived. */
    public $url=null;

    /** Output file name; only its basename is used when saving. */
    public $filename=null;

    /** phpQuery document built from the downloaded page. */
    private $doc=null;

    /** Associative array mapping each absolute asset URL to the list of DOM elements that reference it. */
    private $asset_urls=array();

    /** Set (URL => true) of asset URLs whose download callback has already run. */
    private $fetched_urls=array();

    /** ParallelCurl instance used to download the assets. */
    private $multi_curl=null;

    /** CSS selectors of the elements whose src/href assets get embedded. */
    public $selectors=array(
        'link[href]',
        'script[src]',
        'img[src]',
        'object[src]'
        //TODO: iframe frame
    );

    /**
     * Downloads the page and prepares the parallel downloader.
     *
     * @param string      $url      address of the page to archive
     * @param string|null $filename output file name; when empty it is derived
     *                              from the URL (scheme and leading "www."
     *                              removed, ".html" appended)
     * @throws RuntimeException when the page itself cannot be downloaded
     */
    function __construct($url,$filename=null){
        $this->url=$url;
        if ($filename) {
            $this->filename=$filename;
        } else {
            // Trim the trailing slash so "http://example.com/" becomes
            // "example.com.html" instead of a bare ".html".
            $stripped=preg_replace('@^https?://(www\.)?@i','',$url);
            $this->filename=rtrim($stripped,'/') . '.html';
        }

        $html=file_get_contents($url);
        if ($html === false) {
            throw new RuntimeException("Unable to download {$url}");
        }
        $this->doc=phpQuery::newDocument($html);

        $this->multi_curl = new ParallelCurl(10,array(
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_USERAGENT => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1",
        ));
    }

    /**
     * Parallel Curl callback: replaces the asset reference with a data URI.
     *
     * @param string $response_body downloaded asset content
     * @param string $url           URL that was requested
     * @param mixed  $ch            curl handle of the finished request
     * @param mixed  $element       a DOM element, or an array of DOM elements,
     *                              whose href/src attributes point at $url
     */
    function set_data_url($response_body,$url,$ch,$element){
        if (isset($this->fetched_urls[$url])) {
            return;
        }
        $this->fetched_urls[$url]=true;
        file_put_contents('php://stderr',"fetching {$url}\n");

        // Leave the original reference in place when the download failed;
        // embedding an error page or an empty body would break the asset.
        $status=(int) curl_getinfo($ch,CURLINFO_HTTP_CODE);
        if ($response_body === false || $response_body === null || $response_body === '' || $status >= 400) {
            file_put_contents('php://stderr',"skipping {$url}: download failed\n");
            return;
        }

        // Keep only the media type; parameters such as "; charset=..." are dropped.
        $mime_type=preg_replace('/;.*$/','',(string) curl_getinfo($ch,CURLINFO_CONTENT_TYPE));
        $data_uri='data:' . $mime_type . ';base64,' . base64_encode($response_body);

        $elements=is_array($element) ? $element : array($element);
        foreach ($elements as $node) {
            foreach (array('href','src') as $attr) {
                if ($node->hasAttribute($attr)) {
                    pq($node)->attr($attr,$data_uri);
                }
            }
        }
    }

    /**
     * Resolve relative URLs.
     *
     * Handles fully qualified references (returned untouched),
     * protocol-relative references ("//host/path"), root-relative references
     * ("/path") and references relative to the directory of the page.
     * The scheme and port of the page are preserved. "." and ".." segments
     * are not collapsed.
     *
     * @param string $u reference found in the document
     * @param string $p URL of the page containing the reference
     * @return string absolute URL, or $u unchanged when $p has no host
     */
    private function absolute_url($u,$p){
        $u = trim($u);
        $url = parse_url( $u );
        $page = parse_url( $p );

        // Already fully qualified: http://..., https://..., data:..., etc.
        if ( is_array( $url ) && isset( $url[ 'scheme' ] ) ) {
            return $u;
        }

        $scheme = ( is_array( $page ) && isset( $page[ 'scheme' ] ) ) ? $page[ 'scheme' ] : 'http';

        // Protocol-relative reference: borrow the scheme of the page.
        if ( strpos( $u, '//' ) === 0 ) {
            return $scheme . ':' . $u;
        }

        // Without a host on the page there is nothing to resolve against.
        if ( ! is_array( $page ) || ! isset( $page[ 'host' ] ) ) {
            return $u;
        }

        $base = $scheme . '://' . $page[ 'host' ];
        if ( isset( $page[ 'port' ] ) ) {
            $base .= ':' . $page[ 'port' ];
        }

        // Root-relative reference.
        if ( strpos( $u, '/' ) === 0 ) {
            return $base . $u;
        }

        // Relative to the directory of the page: keep the page path up to
        // and including its last slash (strip the basename).
        $path = isset( $page[ 'path' ] ) ? $page[ 'path' ] : '/';
        $last_slash = strrpos( $path, '/' );
        $directory = ( $last_slash === false ) ? '/' : substr( $path, 0, $last_slash + 1 );

        return $base . $directory . $u;
    }

    /**
     * Maps assets URL to the DOM elements that reference them.
     *
     * Elements carrying a rel attribute are only kept when it is
     * "shortcut icon" or "stylesheet". Empty references and references that
     * already are data URIs are ignored.
     *
     * @param string $url URL of the page, used to resolve relative references
     */
    private function get_urls($url){
        foreach ($this->selectors as $selector) {
            foreach (pq($selector) as $element) {
                //move on to the next iteration if link is not shortcut icon or stylesheet
                $rel=pq($element)->attr('rel');
                if ($rel && !in_array(strtolower($rel),array('shortcut icon','stylesheet'))) {
                    continue;
                }
                foreach (array('src','href') as $attr) {
                    if (!$element->hasAttribute($attr)) {
                        continue;
                    }
                    $reference=trim((string) pq($element)->attr($attr));
                    if ($reference === '' || stripos($reference,'data:') === 0) {
                        continue;
                    }
                    $absolute_url=$this->absolute_url($reference,$url);
                    // Several elements may share one asset; remember all of
                    // them so each one gets rewritten after a single download.
                    if (!isset($this->asset_urls[$absolute_url])) {
                        $this->asset_urls[$absolute_url]=array();
                    }
                    $this->asset_urls[$absolute_url][]=$element;
                }
            }
        }
    }

    /**
     * Updates the DOM with the data URL of its linked assets and saves the
     * document in the current working directory.
     *
     * @param string|null $url base URL used to resolve relative references;
     *                         defaults to the URL given to the constructor
     */
    function fetch($url=null){
        $url or $url = $this->url;
        $this->get_urls($url);
        foreach ($this->asset_urls as $u => $elements) {
            $this->multi_curl->startRequest((string) $u,array($this,'set_data_url'),$elements);
        }
        $this->multi_curl->finishAllRequests();
        //TODO: gzip file
        $target=basename($this->filename);
        if (file_put_contents($target,$this->doc->html()) === false) {
            file_put_contents('php://stderr',"unable to write {$target}\n");
        }
    }
}
// Command-line entry point: <script> <url> [output-file]
// The URL is mandatory; the output file name is optional and, when omitted,
// is derived from the URL by PageSnap.
if (!isset($argv[1])) {
    file_put_contents('php://stderr', 'Usage: ' . basename(__FILE__) . " <url> [output-file]\n");
    exit(1);
}
$p = new PageSnap($argv[1], isset($argv[2]) ? $argv[2] : null);
$p->fetch();
?>
@afiore

This comment has been minimized.

Copy link
Owner Author

afiore commented Jun 21, 2010

Although this script is itself smart, a better name would make it even more so!

@fulldecent

This comment has been minimized.

Copy link

fulldecent commented Dec 6, 2014

I would love a webservice that ran this, great idea

@ryangurn

This comment has been minimized.

Copy link

ryangurn commented Jan 1, 2015

Can you work on a solution for CSS files that @import other CSS files? Is there any workaround for that?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.