Skip to content

Instantly share code, notes, and snippets.

@onishi
Created November 14, 2012 09:30
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save onishi/4071196 to your computer and use it in GitHub Desktop.
Save onishi/4071196 to your computer and use it in GitHub Desktop.
HTMLまるごと保存
#!/usr/bin/env perl
use strict;
use warnings;
use utf8;
use DateTime;
use Digest::SHA1 qw(sha1_hex);
use Encode;
use File::Path qw/make_path/;
use HTML::Parser;
use HTML::ResolveLink;
use HTTP::Request::Common qw/GET/;
use IO::All;
use LWP::UserAgent;
use URI;
my $path = './';
my $uri = URI->new(shift) or die;
my $now = DateTime->now;
my $ymd = $now->ymd;
my $ua = LWP::UserAgent->new(agent => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)');
my $resolver = HTML::ResolveLink->new(base => $uri);
my $res = $ua->request(GET $uri);
my $content = $resolver->resolve($res->decoded_content);
my $dir = $uri;
$dir =~ s{[^A-Za-z0-9.]+}{-}g;
$dir =~ s{-+$}{};
$dir = "$path/$dir/$ymd/";
$dir =~ s{/+}{/}g;
make_path($dir);
my $disallow_tag = qr{script};
my $nodisplay_tag = qr{noscript};
my $result;
my $context = { disallow => 0 };
my $parser = HTML::Parser->new(
api_version => 3,
start_h => [
sub {
my($self, $tagname, $attr, $text) = @_;
if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
return;
} elsif ($tagname =~ /^(?:$disallow_tag)$/i) {
$context->{disallow}++;
return;
}
$result .= "<$tagname";
for my $key (sort keys %$attr) {
$key eq '/' and next;
my $value = $attr->{$key};
if ($key =~ /^(?:src)$/i) {
$value = get_src($value);
} elsif ($tagname =~ /^(?:link)$/i and $key =~ /^(?:href)$/i) {
$value = get_link($value);
} elsif ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) {
$value = $path;
}
$result .= qq{ $key="$value"};
}
$result .= ">";
},
'self,tagname,attr,text',
],
end_h => [
sub {
my($self, $tagname, $text) = @_;
if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
return;
} elsif ($tagname =~ /^(?:$disallow_tag)$/i) {
$context->{disallow}--;
return;
}
$result .= $text;
},
'self,tagname,text',
],
default_h => [
sub {
my($self, $text) = @_;
if ($context->{disallow} > 0) {
return;
}
$result .= $text;
},
'self,text',
],
);
$parser->parse($content);
$result =~ s{(<head[^>]*>)}{$1<meta http-equiv="Content-Type" content="text/html; charset=utf-8">}i; # XXX
$result = Encode::encode('utf-8', $result);
$result > io("${dir}index.html");
print "${dir}index.html\n";
sub get_src {
my $src = shift or return;
unless (-e "${dir}file") {
make_path("${dir}file");
}
my $file = $src;
$file =~ s{[^A-Za-z0-9.]+}{-}g;
if (length($file) > 255) {
$file = sha1_hex($file);
}
$file = "file/$file";
$file =~ s{/+}{/}g;
unless (-e "$dir$file") {
$ua->request(GET $src)->content >> io("$dir$file");
sleep(1); # DOS対策対策
}
$file;
}
sub get_link {
my $url = shift or return;
my $file = get_src($url);
my $io = io("$dir$file");
my $content = $io->slurp;
$content =~ s{url\(([^\)]+)\)}{
my $link = $1;
$link =~ s{^[\s\"\']+}{};
$link =~ s{[\s\"\']+$}{};
# relative link (from HTML::ResolveLink)
my $u = URI->new($link);
unless (defined $u->scheme) {
my $old = $u;
$u = $u->abs($url);
}
$link = get_src($u);
$link =~ s{^file/}{};
"url($link)";
}eg;
$content > $io;
return $file;
}
@onishi
Copy link
Author

onishi commented Nov 14, 2012

なんか適当に書いた。気が向いたらいい感じにモジュールとかにしたい

@onishi
Copy link
Author

onishi commented Nov 14, 2012

車輪の再発明感

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment