#!/usr/local/bin/perl
#
# curlmirror.pl
#
# Mirrors a web site by using curl to download each page.
# The result is stored in a directory named "dest" by default.
# Temporary files are stored in "/tmp".
#
# Author: Kjell.Ericson@haxx.se
#
# Limitations:
# All links are currently rewritten relative to the root, so pages end up
# with a lot of "../../" prefixes.
#
# History:
#
# 1999-11-19 v0.9 - Kjell Ericson - First version
# 1999-11-22 v0.10 - Kjell Ericson - Added some more flags
# 1999-12-06 v0.11 - Kjell Ericson - Relative paths were not corrected
# 1999-12-06 v1.0 - Kjell Ericson - Satisfied and updated to v1.0
# 1999-12-07 v1.1 - Kjell Ericson - Added "-p"
# 1999-12-08 v1.2 - Kjell Ericson - Added "-l" and "-c"
# 1999-12-13 v1.3 - Kjell Ericson - Added match for images in stylesheets
# 2000-08-07 v1.4 - Kjell Ericson - Handles both ' and " in links.
# 2000-08-15 v1.5 - Kjell Ericson - Added -I.
# 2000-08-16 v1.6 - Kjell Ericson - Added multiple -I and -B.
# 2002-01-23 v1.7 - Anthony Thyssen - Changed the destination filename
# 2002-07-14 v1.8 - Kjell Ericson - Corrected a temp-filename error
#
$max_deep=1000;             # maximum link depth to follow (-d)
$max_size=2;                # maximum total download size in MB (-s)
$dest_dir="dest";           # output directory (-o)
$default_name="index.html"; # name used when a URL ends in "/" (-i)
$tmp="/tmp";                # temporary directory (-t)
$filecounter=0;             # counter used to make temp filenames unique
# Extensions never treated as HTML, kept as one regex for fast matching:
$nonhtmlfiles="jpg|gif|png|zip|doc|txt|pdf|exe|java";
$help=
"Usage: curlmirror.pl [flags] [url]\n".
"\n".
"-a <args> : Curl specific arguments\n".
"-B <url> : Only retrieve URL below this URL (default is [url]).\n".
"-b <name> : Pattern that will be stripped from filename.\n".
"-c : Ignore CGI's (i.e URL's with '?' in them) (default off).\n".
"-d <number>: Depth to scan on (default unlimited).\n".
"-f : Flat directory structure is made (be careful).\n".
"-F : Flat directory structure but use path in filename.\n".
"-i <name> : Default name for unknown filenames (default is 'index.html').\n".
"-I <regex> : Don't handle files matching this pattern (default is \"\".\n".
"-l : Only load HTML-pages - no images (default is to load all).\n".
"-o <dir> : Directory to output result in (default is 'dest').\n".
"-s <number>: Max size in Mb of downloaded data (default 2 Mb)\n".
"-p : Always load images (default is not to).\n".
"-t <dir> : Temporary directory (default is '/tmp').\n".
"-v : Verbose output.\n".
"\n".
"Example:\n".
"curlmirror.pl http://www.perl.com/\n".
"\nAuthor: Kjell.Ericson\@haxx.se\n";
for ($i=0; $i<=$#ARGV; $i++) {
    $arg=$ARGV[$i];
    if ($arg =~ s/^-//) {
        if ($arg =~ m/\?/) {
            print $help;
            exit();
        }
        if ($arg =~ m/a/) {
            $curl_args=$ARGV[++$i];
        }
        if ($arg =~ m/B/) {
            $base=$ARGV[++$i];
            if ($base !~ m/(http:\/\/[^\/]*)/i) {
                print "***Malformed -B(ase)\n";
                die($help);
            }
            $base=~ s/([+*~^()\\])/\\$1/g; # escape chars
            push @basematch, $base;
        }
        if ($arg =~ m/I/) {
            push @ignorepatt, $ARGV[++$i];
        }
        if ($arg =~ m/b/) {
            $strip_from_file=$ARGV[++$i];
        }
        if ($arg =~ m/c/) {
            $ignore_cgi=1;
        }
        if ($arg =~ m/d/) {
            $max_deep=$ARGV[++$i];
        }
        if ($arg =~ m/o/) {
            $dest_dir=$ARGV[++$i];
            $dest_dir=~ s/\/$//g;
        }
        if ($arg =~ m/t/) {
            $tmp=$ARGV[++$i];
            $tmp=~ s/\/$//g;
        }
        if ($arg =~ m/s/) {
            $max_size=$ARGV[++$i];
        }
        if ($arg =~ m/i/) {
            $default_name=$ARGV[++$i];
        }
        if ($arg =~ m/l/) {
            $only_html=1;
        }
        if ($arg =~ m/v/) {
            $verbose=1;
        }
        if ($arg =~ m/p/) {
            $picture_load=1;
        }
        if ($arg =~ m/f/) {
            $flat=1;
        }
        if ($arg =~ m/F/) {
            $flat=2;
        }
    } else { # default
        $start=$arg;
    }
}
$curl="curl -s $curl_args ";
if ($base eq "") {
if ($start !~ m/(http:\/\/.+\/)/i) {
if ($start =~ m/(http:\/\/.+)/i) {
$start.="/";
} else {
print "***Malformed start URL ($start)\n";
die($help);
}
}
$base=$start;
$base=~ s/\/[^\/]+$/\//; # strip docname
$base=~ s/([+*~^()\\])/\\$1/g; # escape chars
$basematch[0]=$base;
}
$follow_link{"start"}=0;
$linktmp="[ \n\r]*=[ \r\n]*)([\"'][^\"']*[\"']|[^ )>]*)";
%follow=(
"(<[^>]*a[^>]+href$linktmp", "link",
"(<[^>]*area[^>]+href$linktmp", "link",
"(<[^>]*frame[^>]+src$linktmp", "link",
);
if ($only_html == 0) {
%follow=(%follow,
"(BODY[^>]*\{[^}>]*background-image:[^>}]*url[(])([^\}>\) ]+)", "img", # for stylesheets
"(<[^>]*img[^>]+src$linktmp", "img",
"(<[^>]*body[^>]+background$linktmp", "img",
"(<[^>]*applet[^>]+archive$linktmp", "archive",
"(<[^>]*td[^>]+background$linktmp", "img",
"(<[^>]*tr[^>]+background$linktmp", "img",
"(<[^>]*table[^>]+background$linktmp", "img",
);
}
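# Composed with $linktmp, a %follow key such as "(<[^>]*a[^>]+href$linktmp"
# becomes the pattern
#   (<[^>]*a[^>]+href[ \n\r]*=[ \r\n]*)(["'][^"']*["']|[^ )>]*)
# where $1 captures the tag up to and including the "=", and $2 captures the
# quoted or unquoted URL, e.g. "index.html" in <a href="index.html">.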
$deep=0;
$found=1;
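# Breadth-first crawl: each pass of the while loop handles every URL queued
# at depth $deep, queues the links it finds at $deep+1, and marks processed
# URLs with a depth of -1 so they are fetched only once.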
while ($found && $deep<$max_deep) {
    $found=0;
    foreach $url (keys %follow_link) {
        $current_depth=$follow_link{$url};
        # print STDERR ">$url $current_depth\n";
        if ($current_depth == $deep && $current_depth>=0 &&
            $total_size<$max_size*1024*1024) {
            $found=1;
            $current_depth++;
            if ($url eq "start") {
                delete $follow_link{$url};
                $url=$start;
                $url="stdin" if ($url eq "");
                $start="";
            }
            $follow_link{$url}=-1;
            $stop=0;
            $status_code=0;
            $content_type="";
            $real_url=$url;
            $real_url=~ s/#(.*)//; # strip bookmarks before loading
            if ($url !~ m/[ \n\r]/) {
                $filecounter++;
                $this_file_name="$filecounter..$real_url";
                $this_file_name=~ s/%([a-fA-F0-9][a-fA-F0-9])/chr hex $1/eg;
                #$this_file_name=~ s/[^a-zA-Z0-9.]+/_/g;
                $this_file_name=~ s/[^\w\d]+/_/g;
                $content_type="";
                print STDERR "Get $deep:$url\n" if ($verbose);
                $head=`$curl -D - -o "$tmp/$this_file_name" "$real_url"`;
                $filenames{$real_url}=$this_file_name;
                if ($head =~ m/Location: *["]?(.*)["]?/i) {
                    $loc=$1;
                    $loc=~ s/[\r\n]//g;
                    $loc=merge_urls($real_url, $loc);
                    if (accept_url($loc) ||
                        ($picture_load && $linktype{$real_url} eq "img")) {
                        `rm "$tmp/$this_file_name"`;
                        delete $filenames{$real_url};
                        $real_url=$loc;
                        $url=$loc;
                        print STDERR "Reget $deep:$url\n" if ($verbose);
                        $head=`$curl -D - -o "$tmp/$this_file_name" "$real_url"`;
                        $filenames{$real_url}=$this_file_name;
                        $follow_link{$real_url}=-1;
                    }
                }
                $total_size+=-s "$tmp/$this_file_name";
                if ($head =~ m/^HTTP[^\n\r]* ([0-9]+) ([^\n\r]*)/s) {
                    $status_code=$1;
                }
                if ($head =~ m/[\n\r]Content-Type:(.*)[\r\n]/si) {
                    $content_type=$1;
                }
                $linktype{$real_url}=$content_type;
                if ($content_type !~ m/html/i) {
                    if ($only_html) { # remove this file
                        $total_size-=-s "$tmp/$this_file_name";
                        `rm "$tmp/$this_file_name"`;
                        delete $filenames{$real_url};
                    }
                } else {
                    $text=`cat "$tmp/$this_file_name"`;
                    if ($current_depth<$max_deep) {
                        $linktype{$real_url}="html";
                        $text="" if ($url =~ m/\#/);
                        foreach $search (keys %follow) {
                            while ($text =~ s/$search//si) {
                                $link=$2;
                                $link=~ s/[\"\']//g;
                                $link=~ s/#.*//;
                                $newurl=merge_urls($url, $link);
                                if ($ignore_cgi==0 || $newurl !~ m/\?/) {
                                    if (accept_url($newurl) ||
                                        ($picture_load && $follow{$search} eq "img")) {
                                        if (!exists $follow_link{$newurl}) {
                                            if ($only_html == 0 ||
                                                $newurl !~ m/\.($nonhtmlfiles)$/i) {
                                                $follow_link{$newurl}=$current_depth;
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    $deep++;
}
print STDERR "Max size exceeded ($total_size bytes)!\n" if ($total_size>=$max_size*1024*1024);
print STDERR "Total size loaded:$total_size bytes\n" if ($verbose);
foreach $url (keys %filenames) {
    local $destname=$url;
    $destname=~ s/$basematch[0]//;
    local $destdir=$destname;
    $destdir="" if ($destdir !~ m/\//);
    $destdir=~ s/\/[^\/]*$/\//;
    $destname=~ s/^.*\///g;
    $destname=~ s/#(.*)//;
    local $bookmark=$1;
    $destname=~ s/[^a-zA-Z0-9.]/_/g; # strip chars we don't want in a filename
    $destdir=~ s/$strip_from_file// if ($strip_from_file ne "");
    $destdir=~ s/^([^\/]+):\/\//$1_/;
    $destdir=~ s/[^a-zA-Z0-9.\/_]/_/g;
    $destdir=~ s/(^\/)|(\/$)//g; # strip leading/trailing slashes
    if ($flat) {
        if ($flat==2) {
            $destdir=~ s/[\/:]/_/g;
            $destdir.="_";
        } else {
            $destdir="";
        }
        `mkdir -p "$dest_dir"`;
    } else {
        local $tmp="$dest_dir/$destdir";
        $tmp=~ s/^\///g;
        `mkdir -p "$tmp"`;
        $destdir.="/" if ($destdir ne "");
    }
    $destname=$default_name if ($destname eq "");
    $destname=$destdir.$destname if ($destdir ne "");
    if (($linktype{$url} =~ m/html/i) && ($destname !~ m/\.[s]?htm/i)) {
        $destname.=".html";
    }
    $destfile{$url}=$destname;
}
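# Third pass: move non-HTML files straight into place; for HTML files,
# rewrite every matched link to a relative path, save the result, and
# remove the temporary copy.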
foreach $url (keys %filenames) {
    $name=$filenames{$url};
    $destname=$destfile{$url};
    if ($linktype{$url} !~ m/html/) {
        `mv "$tmp/$name" "$dest_dir/$destname"`;
    } else {
        $text=`cat "$tmp/$name"`;
        foreach $search (keys %follow) {
            $text=~ s/$search/"$1\"".make_file_relative($url,merge_urls($url, $2))."\""/sgie;
        }
        if (open(OUT, ">$dest_dir/$destname")) {
            print OUT $text;
            close(OUT);
        } else {
            print STDERR "Couldn't save file '$dest_dir/$destname'\n";
        }
        `rm "$tmp/$name"`;
    }
}
# Input: Base-URL, MakeRelative-URL
#
# Function: Convert and return "MakeRelative-URL" so that it is relative
# to "Base-URL", using the destination filenames computed in %destfile.
#
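# For example, assuming the hypothetical entries
#   $destfile{"http://host/a/b.html"} = "a/b.html"
#   $destfile{"http://host/c/d.png"}  = "c/d.png"
# make_file_relative("http://host/a/b.html", "http://host/c/d.png") returns
# "../c/d.png". URLs with no %destfile entry are returned unchanged.
#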
sub make_file_relative
{
    local ($from, $to)=@_;
    local $result="";
    local $sourcename=$destfile{$from};
    local $destname;
    local $bookmark;
    local $dir;
    if ($to=~ s/(\#.*)$//) { # extract bookmark
        $bookmark=$1;
    }
    $destname=$destfile{$to};
    if ($destname eq "") {
        return $to.$bookmark;
    }
    $sourcename="" if ($sourcename !~ m/\//);
    $sourcename=~ s/\/[^\/]*$/\//; # strip filename
    # Strip the leading directories common to both paths. $dir is declared
    # outside the do-block; a "local" inside it would be restored before the
    # while test runs, stopping the loop after a single component.
    do {
        if ($sourcename =~ m/^([^\/]*\/)/) {
            $dir=$1;
        } else {
            $dir="";
        }
        if ($dir ne "") {
            $dir=~ s/([*.\\\/\[\]()+|])/\\$1/g;
            if ($destname =~ s/^$dir//) {
                $sourcename=~ s/^$dir//;
            } else {
                $dir="";
            }
        }
    } while ($dir ne "");
    $sourcename=~ s/[^\/]+\//..\//g; # turn each remaining dir into "../"
    $result="$sourcename$destname";
    $result=~ s/^\///g;
    return $result.$bookmark;
}
# Function: If you are viewing location "$base", which is a full URL, and
# click on "$new", which can be full or relative - where do you end up?
# That is what this function returns.
#
# Input: base-URL, new-URL (where to go)
# Returns: a full-format new-URL (without bookmark)
#
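# For example (hypothetical URLs):
#   merge_urls("http://host/a/b.html", "../c.html")  returns "http://host/c.html"
#   merge_urls("http://host/a/b.html", "/img/x.gif") returns "http://host/img/x.gif"
#   merge_urls("http://host/a/b.html", "http://other.example/") is returned
#     as-is, because $new already names a protocol.
#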
sub merge_urls
{
    local ($org, $new)=@_;
    local $url; # note: $new is already the second argument above
    $new=~ s/[\"\']//g;
    if ($new =~ m/.*:/) {
        $url=$new;
    } elsif ($new eq "") {
        $url=$org;
    } else {
        if ($org =~ m/(.*):\/\/([^\/]*)(.*)$/) {
            local $prot=$1;
            local $server=$2;
            local $pathanddoc=$3;
            local $path;
            local $doc=$3;
            if ($pathanddoc=~ m/^(.*)\/(.*)$/) {
                $path=$1;
                $doc=$2;
            }
            $doc=~ s/#(.*)//;
            local $bookmark=$1;
            if ($new =~ m/^#/) {
                $url="$prot://$server$path/$doc$new";
            } elsif ($new =~ m/^\//) {
                $url="$prot://$server$new";
            } else {
                $url="$prot://$server$path/$new";
                while ($url =~ s/\/[^\/]*\/\.\.\//\//) {}
                while ($url =~ s/\.\///) {}
            }
        }
    }
    return $url;
}
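# Function: Decide whether a URL should be mirrored. It must match at least
# one -B base pattern (@basematch) and none of the -I ignore patterns
# (@ignorepatt).
#
# Input: URL
# Returns: 1 to accept, 0 to reject
#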
sub accept_url
{
    local ($url)=@_;
    local $ret=0;
    print STDERR "test $url\n" if ($verbose);
    for (@basematch) {
        if ($url =~ m/$_/) {
            $ret=1;
        }
    }
    return 0 if ($ret == 0); # no base matched
    for (@ignorepatt) {
        if ($url =~ m/$_/) {
            return 0;
        }
    }
    print STDERR "match $url\n" if ($verbose);
    return 1;
}