Skip to content

Instantly share code, notes, and snippets.

@dann
Created August 21, 2008 14:32
Show Gist options
  • Save dann/6569 to your computer and use it in GitHub Desktop.
Save dann/6569 to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
use strict;
use warnings;
use Web::Scraper;
use HTTP::Async;
use HTTP::Request;
use Path::Class;
use LWP::UserAgent;
use Perl6::Say;
use Digest::MD5 qw(md5_hex);
use IO::All;
use Cwd;
use File::Find::Rule;
main();
sub main {
my $pages = fetch_4u_pages();
my $photo_urls = scrape_photo_urls($pages);
download_photos($photo_urls);
remove_trashes();
}
sub fetch_4u_pages {
my $nums = [ 1 .. 300 ];
my @urls = map {"http://4u.straightline.jp/?page=$_"} @{$nums};
my $async = HTTP::Async->new(slots => 1, poll_interval=>1, timeout=>720);
my $pages_map = {};
for my $url (@urls) {
my $id = $async->add( HTTP::Request->new( GET => $url ) );
$pages_map->{$id} = $url;
}
my $contents = ();
while ( my ( $response, $id ) = $async->wait_for_next_response ) {
if ( $response->is_success ) {
say 'Fetching pages: ' . $pages_map->{$id};
push @{$contents}, $response->content;
}
}
$contents;
}
sub scrape_photo_urls {
my $contents = shift;
my $scraper = scraper {
process 'p.entry-img-src', 'urls[]' => 'TEXT';
};
my @photo_urls = ();
foreach my $content ( @{$contents} ) {
my $result = $scraper->scrape($content);
push @photo_urls, @{ $result->{urls} };
}
use Data::Dumper;
@photo_urls = map { "http://" . $_ } @photo_urls;
¥@photo_urls;
}
sub download_photos {
my $photo_urls = shift;
my $async = HTTP::Async->new;
my $photos_map = {};
foreach my $url ( @{$photo_urls} ) {
my $id = $async->add( HTTP::Request->new( GET => $url ) );
$photos_map->{$id} = $url;
}
while ( my ( $response, $id ) = $async->wait_for_next_response ) {
say "Downloading...: " . $photos_map->{$id};
if ( $response->is_success ) {
eval {
my $filename = md5_hex( $photos_map->{$id} );
my $out = io( $filename . ".jpg" ) or die $!;
$out->print( $response->content );
$out->close;
};
}
}
}
sub remove_trashes {
my @files
= File::Find::Rule->file()->maxdepth(1)->name('*.jpg')->size("<3K")
->in(getcwd);
foreach my $file (@files) {
if ( -f $file ) {
file($file)->remove;
}
}
}
__END__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment