#!/usr/local/bin/perl
#
# curlmirror.pl
#
# Mirrors a web site by using curl to download each page.
# The result is stored in a directory named "dest" by default.
# Temporary files are stored in "/tmp".
#
# Author: Kjell.Ericson@haxx.se
#
# Limitations:
# All links are currently rewritten relative to the root, so pages end up
# with a lot of "../../" prefixes.
#
# History:
#
# 1999-11-19 v0.9 - Kjell Ericson - First version
# 1999-11-22 v0.10 - Kjell Ericson - Added some more flags
# 1999-12-06 v0.11 - Kjell Ericson - Relative paths were not corrected
# 1999-12-06 v1.0 - Kjell Ericson - Satisfied and updated to v1.0
# 1999-12-07 v1.1 - Kjell Ericson - Added "-p"
# 1999-12-08 v1.2 - Kjell Ericson - Added "-l" and "-c"
# 1999-12-13 v1.3 - Kjell Ericson - Added match for images in stylesheets
# 2000-08-07 v1.4 - Kjell Ericson - Handles both ' and " in links.
# 2000-08-15 v1.5 - Kjell Ericson - Added -I.
# 2000-08-16 v1.6 - Kjell Ericson - Added multiple -I and -B.
# 2002-01-23 v1.7 - Anthony Thyssen - Changed the destination filename
# 2002-07-14 v1.8 - Kjell Ericson - Corrected a temp-filename error
#
$max_deep=1000;             # maximum link depth to follow (-d)
$max_size=2;                # maximum total download size in MB (-s)
$dest_dir="dest";           # output directory (-o)
$default_name="index.html"; # name used when a URL ends in "/" (-i)
$tmp="/tmp";                # temporary directory (-t)
$filecounter=0;             # counter used to make temp filenames unique
# Extensions never treated as HTML, kept as one regex for fast matching:
$nonhtmlfiles="jpg|gif|png|zip|doc|txt|pdf|exe|java";
$help=
"Usage: curlmirror.pl [flags] [url]\n".
"\n".
"-a <args> : Curl specific arguments\n".
"-B <url> : Only retrieve URL below this URL (default is [url]).\n".
"-b <name> : Pattern that will be stripped from filename.\n".
"-c : Ignore CGI's (i.e URL's with '?' in them) (default off).\n".
"-d <number>: Depth to scan on (default unlimited).\n".
"-f : Flat directory structure is made (be careful).\n".
"-F : Flat directory structure but use path in filename.\n".
"-i <name> : Default name for unknown filenames (default is 'index.html').\n".
"-I <regex> : Don't handle files matching this pattern (default is \"\".\n".
"-l : Only load HTML-pages - no images (default is to load all).\n".
"-o <dir> : Directory to output result in (default is 'dest').\n".
"-s <number>: Max size in Mb of downloaded data (default 2 Mb)\n".
"-p : Always load images (default is not to).\n".
"-t <dir> : Temporary directory (default is '/tmp').\n".
"-v : Verbose output.\n".
"\n".
"Example:\n".
"curlmirror.pl http://www.perl.com/\n".
"\nAuthor: Kjell.Ericson\@haxx.se\n";
for ($i=0; $i<=$#ARGV; $i++) {
    $arg=$ARGV[$i];
    if ($arg =~ s/^-//) {
        if ($arg =~ m/\?/) {
            print $help;
            exit();
        }
        if ($arg =~ m/a/) {
            $curl_args=$ARGV[++$i];
        }
        if ($arg =~ m/B/) {
            $base=$ARGV[++$i];
            if ($base !~ m/(http:\/\/[^\/]*)/i) {
                print "***Malformed -B(ase)\n";
                die($help);
            }
            $base=~ s/([+*~^()\\])/\\$1/g; # escape chars
            push @basematch, $base;
        }
        if ($arg =~ m/I/) {
            push @ignorepatt, $ARGV[++$i];
        }
        if ($arg =~ m/b/) {
            $strip_from_file=$ARGV[++$i];
        }
        if ($arg =~ m/c/) {
            $ignore_cgi=1;
        }
        if ($arg =~ m/d/) {
            $max_deep=$ARGV[++$i];
        }
        if ($arg =~ m/o/) {
            $dest_dir=$ARGV[++$i];
            $dest_dir=~ s/\/$//g;
        }
        if ($arg =~ m/t/) {
            $tmp=$ARGV[++$i];
            $tmp=~ s/\/$//g;
        }
        if ($arg =~ m/s/) {
            $max_size=$ARGV[++$i];
        }
        if ($arg =~ m/i/) {
            $default_name=$ARGV[++$i];
        }
        if ($arg =~ m/l/) {
            $only_html=1;
        }
        if ($arg =~ m/v/) {
            $verbose=1;
        }
        if ($arg =~ m/p/) {
            $picture_load=1;
        }
        if ($arg =~ m/f/) {
            $flat=1;
        }
        if ($arg =~ m/F/) {
            $flat=2;
        }
    } else { # default
        $start=$arg;
    }
}
$curl="curl -s $curl_args ";
if ($base eq "") {
if ($start !~ m/(http:\/\/.+\/)/i) {
if ($start =~ m/(http:\/\/.+)/i) {
$start.="/";
} else {
print "***Malformed start URL ($start)\n";
die($help);
}
}
$base=$start;
$base=~ s/\/[^\/]+$/\//; # strip docname
$base=~ s/([+*~^()\\])/\\$1/g; # escape chars
$basematch[0]=$base;
}
$follow_link{"start"}=0;
$linktmp="[ \n\r]*=[ \r\n]*)([\"'][^\"']*[\"']|[^ )>]*)";
%follow=(
"(<[^>]*a[^>]+href$linktmp", "link",
"(<[^>]*area[^>]+href$linktmp", "link",
"(<[^>]*frame[^>]+src$linktmp", "link",
);
if ($only_html == 0) {
%follow=(%follow,
"(BODY[^>]*\{[^}>]*background-image:[^>}]*url[(])([^\}>\) ]+)", "img", # for stylesheets
"(<[^>]*img[^>]+src$linktmp", "img",
"(<[^>]*body[^>]+background$linktmp", "img",
"(<[^>]*applet[^>]+archive$linktmp", "archive",
"(<[^>]*td[^>]+background$linktmp", "img",
"(<[^>]*tr[^>]+background$linktmp", "img",
"(<[^>]*table[^>]+background$linktmp", "img",
);
}
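# Composed with $linktmp, a %follow key such as "(<[^>]*a[^>]+href$linktmp"
# becomes the pattern
#   (<[^>]*a[^>]+href[ \n\r]*=[ \r\n]*)(["'][^"']*["']|[^ )>]*)
# where $1 captures the tag up to and including the "=", and $2 captures the
# quoted or unquoted URL, e.g. "index.html" in <a href="index.html">.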
$deep=0;
$found=1;
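# Breadth-first crawl: each pass of the while loop handles every URL queued
# at depth $deep, queues the links it finds at $deep+1, and marks processed
# URLs with a depth of -1 so they are fetched only once.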
while ($found && $deep<$max_deep) {
    $found=0;
    foreach $url (keys %follow_link) {
        $current_depth=$follow_link{$url};
        # print STDERR ">$url $current_depth\n";
        if ($current_depth == $deep && $current_depth>=0 &&
            $total_size<$max_size*1024*1024) {
            $found=1;
            $current_depth++;
            if ($url eq "start") {
                delete $follow_link{$url};
                $url=$start;
                $url="stdin" if ($url eq "");
                $start="";
            }
            $follow_link{$url}=-1;
            $stop=0;
            $status_code=0;
            $content_type="";
            $real_url=$url;
            $real_url=~ s/#(.*)//; # strip bookmarks before loading
            if ($url !~ m/[ \n\r]/) {
                $filecounter++;
                $this_file_name="$filecounter..$real_url";
                $this_file_name=~ s/%([a-fA-F0-9][a-fA-F0-9])/chr hex $1/eg;
                #$this_file_name=~ s/[^a-zA-Z0-9.]+/_/g;
                $this_file_name=~ s/[^\w\d]+/_/g;
                $content_type="";
                print STDERR "Get $deep:$url\n" if ($verbose);
                $head=`$curl -D - -o "$tmp/$this_file_name" "$real_url"`;
                $filenames{$real_url}=$this_file_name;
                if ($head =~ m/Location: *["]?(.*)["]?/i) {
                    $loc=$1;
                    $loc=~ s/[\r\n]//g;
                    $loc=merge_urls($real_url, $loc);
                    if (accept_url($loc) ||
                        ($picture_load && $linktype{$real_url} eq "img")) {
                        `rm "$tmp/$this_file_name"`;
                        delete $filenames{$real_url};
                        $real_url=$loc;
                        $url=$loc;
                        print STDERR "Reget $deep:$url\n" if ($verbose);
                        $head=`$curl -D - -o "$tmp/$this_file_name" "$real_url"`;
                        $filenames{$real_url}=$this_file_name;
                        $follow_link{$real_url}=-1;
                    }
                }
                $total_size+=-s "$tmp/$this_file_name";
                if ($head =~ m/^HTTP[^\n\r]* ([0-9]+) ([^\n\r]*)/s) {
                    $status_code=$1;
                }
                if ($head =~ m/[\n\r]Content-Type:(.*)[\r\n]/si) {
                    $content_type=$1;
                }
                $linktype{$real_url}=$content_type;
                if ($content_type !~ m/html/i) {
                    if ($only_html) { # remove this file
                        $total_size-=-s "$tmp/$this_file_name";
                        `rm "$tmp/$this_file_name"`;
                        delete $filenames{$real_url};
                    }
                } else {
                    $text=`cat "$tmp/$this_file_name"`;
                    if ($current_depth<$max_deep) {
                        $linktype{$real_url}="html";
                        $text="" if ($url =~ m/\#/);
                        foreach $search (keys %follow) {
                            while ($text =~ s/$search//si) {
                                $link=$2;
                                $link=~ s/[\"\']//g;
                                $link=~ s/#.*//;
                                $newurl=merge_urls($url, $link);
                                if ($ignore_cgi==0 || $newurl !~ m/\?/) {
                                    if (accept_url($newurl) ||
                                        ($picture_load && $follow{$search} eq "img")) {
                                        if (!exists $follow_link{$newurl}) {
                                            if ($only_html == 0 ||
                                                $newurl !~ m/\.($nonhtmlfiles)$/i) {
                                                $follow_link{$newurl}=$current_depth;
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    $deep++;
}
print STDERR "Max size exceeded ($total_size bytes)!\n" if ($total_size>=$max_size*1024*1024);
print STDERR "Total size loaded:$total_size bytes\n" if ($verbose);
foreach $url (keys %filenames) {
    local $destname=$url;
    $destname=~ s/$basematch[0]//;
    local $destdir=$destname;
    $destdir="" if ($destdir !~ m/\//);
    $destdir=~ s/\/[^\/]*$/\//;
    $destname=~ s/^.*\///g;
    $destname=~ s/#(.*)//;
    local $bookmark=$1;
    $destname=~ s/[^a-zA-Z0-9.]/_/g; # strip chars we don't want in a filename
    $destdir=~ s/$strip_from_file// if ($strip_from_file ne "");
    $destdir=~ s/^([^\/]+):\/\//$1_/;
    $destdir=~ s/[^a-zA-Z0-9.\/_]/_/g;
    $destdir=~ s/(^\/)|(\/$)//g; # strip leading/trailing slashes
    if ($flat) {
        if ($flat==2) {
            $destdir=~ s/[\/:]/_/g;
            $destdir.="_";
        } else {
            $destdir="";
        }
        `mkdir -p "$dest_dir"`;
    } else {
        local $tmp="$dest_dir/$destdir";
        $tmp=~ s/^\///g;
        `mkdir -p "$tmp"`;
        $destdir.="/" if ($destdir ne "");
    }
    $destname=$default_name if ($destname eq "");
    $destname=$destdir.$destname if ($destdir ne "");
    if (($linktype{$url} =~ m/html/i) && ($destname !~ m/\.[s]?htm/i)) {
        $destname.=".html";
    }
    $destfile{$url}=$destname;
}
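# Third pass: move non-HTML files straight into place; for HTML files,
# rewrite every matched link to a relative path, save the result, and
# remove the temporary copy.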
foreach $url (keys %filenames) {
    $name=$filenames{$url};
    $destname=$destfile{$url};
    if ($linktype{$url} !~ m/html/) {
        `mv "$tmp/$name" "$dest_dir/$destname"`;
    } else {
        $text=`cat "$tmp/$name"`;
        foreach $search (keys %follow) {
            $text=~ s/$search/"$1\"".make_file_relative($url,merge_urls($url, $2))."\""/sgie;
        }
        if (open(OUT, ">$dest_dir/$destname")) {
            print OUT $text;
            close(OUT);
        } else {
            print STDERR "Couldn't save file '$dest_dir/$destname'\n";
        }
        `rm "$tmp/$name"`;
    }
}
# Input: Base-URL, MakeRelative-URL
#
# Function: Convert and return "MakeRelative-URL" so that it is relative
# to "Base-URL", using the destination filenames computed in %destfile.
#
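# For example, assuming the hypothetical entries
#   $destfile{"http://host/a/b.html"} = "a/b.html"
#   $destfile{"http://host/c/d.png"}  = "c/d.png"
# make_file_relative("http://host/a/b.html", "http://host/c/d.png") returns
# "../c/d.png". URLs with no %destfile entry are returned unchanged.
#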
sub make_file_relative
{
    local ($from, $to)=@_;
    local $result="";
    local $sourcename=$destfile{$from};
    local $destname;
    local $bookmark;
    local $dir;
    if ($to=~ s/(\#.*)$//) { # extract bookmark
        $bookmark=$1;
    }
    $destname=$destfile{$to};
    if ($destname eq "") {
        return $to.$bookmark;
    }
    $sourcename="" if ($sourcename !~ m/\//);
    $sourcename=~ s/\/[^\/]*$/\//; # strip filename
    # Strip the leading directories common to both paths. $dir is declared
    # outside the do-block; a "local" inside it would be restored before the
    # while test runs, stopping the loop after a single component.
    do {
        if ($sourcename =~ m/^([^\/]*\/)/) {
            $dir=$1;
        } else {
            $dir="";
        }
        if ($dir ne "") {
            $dir=~ s/([*.\\\/\[\]()+|])/\\$1/g;
            if ($destname =~ s/^$dir//) {
                $sourcename=~ s/^$dir//;
            } else {
                $dir="";
            }
        }
    } while ($dir ne "");
    $sourcename=~ s/[^\/]+\//..\//g; # turn each remaining dir into "../"
    $result="$sourcename$destname";
    $result=~ s/^\///g;
    return $result.$bookmark;
}
# Function: If you are viewing location "$base", which is a full URL, and
# click on "$new", which can be full or relative - where do you end up?
# That is what this function returns.
#
# Input: base-URL, new-URL (where to go)
# Returns: a full-format new-URL (without bookmark)
#
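# For example (hypothetical URLs):
#   merge_urls("http://host/a/b.html", "../c.html")  returns "http://host/c.html"
#   merge_urls("http://host/a/b.html", "/img/x.gif") returns "http://host/img/x.gif"
#   merge_urls("http://host/a/b.html", "http://other.example/") is returned
#     as-is, because $new already names a protocol.
#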
sub merge_urls
{
    local ($org, $new)=@_;
    local $url; # note: $new is already the second argument above
    $new=~ s/[\"\']//g;
    if ($new =~ m/.*:/) {
        $url=$new;
    } elsif ($new eq "") {
        $url=$org;
    } else {
        if ($org =~ m/(.*):\/\/([^\/]*)(.*)$/) {
            local $prot=$1;
            local $server=$2;
            local $pathanddoc=$3;
            local $path;
            local $doc=$3;
            if ($pathanddoc=~ m/^(.*)\/(.*)$/) {
                $path=$1;
                $doc=$2;
            }
            $doc=~ s/#(.*)//;
            local $bookmark=$1;
            if ($new =~ m/^#/) {
                $url="$prot://$server$path/$doc$new";
            } elsif ($new =~ m/^\//) {
                $url="$prot://$server$new";
            } else {
                $url="$prot://$server$path/$new";
                while ($url =~ s/\/[^\/]*\/\.\.\//\//) {}
                while ($url =~ s/\.\///) {}
            }
        }
    }
    return $url;
}
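# Function: Decide whether a URL should be mirrored. It must match at least
# one -B base pattern (@basematch) and none of the -I ignore patterns
# (@ignorepatt).
#
# Input: URL
# Returns: 1 to accept, 0 to reject
#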
sub accept_url
{
    local ($url)=@_;
    local $ret=0;
    print STDERR "test $url\n" if ($verbose);
    for (@basematch) {
        if ($url =~ m/$_/) {
            $ret=1;
        }
    }
    return 0 if ($ret == 0); # no base matched
    for (@ignorepatt) {
        if ($url =~ m/$_/) {
            return 0;
        }
    }
    print STDERR "match $url\n" if ($verbose);
    return 1;
}