Created
March 7, 2010 14:30
-
-
Save xaicron/324387 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
use autodie; | |
use Config::Pit qw/pit_get/; | |
use Web::Scraper qw/scraper process/; | |
use WWW::Mechanize; | |
use LWP::Simple qw/mirror/; | |
use File::Basename qw/basename/; | |
$|++; | |
my $id = shift || die 'Usage: get_twitpic user_id'; | |
my $conf = pit_get 'twitter.com'; | |
my $uri_base = 'http://twitpic.com'; | |
mkdir $id unless -d $id; | |
chdir $id; | |
my $mech = WWW::Mechanize->new(autocheck => 1); | |
$mech->get('http://twitpic.com/'); | |
$mech->submit_form( | |
fields => { | |
username => $conf->{username}, | |
password => $conf->{password}, | |
}, | |
); | |
my $list = scraper { | |
process '//*[@class="profile-photo-img"]/a', 'links[]' => '@href'; | |
process '//*[@id="profile-photos"]/*/a[@class="nav"]', 'navi_urls[]' => '@href'; | |
process '//*[@id="profile-photos"]/*/a[@class="nav"]', 'navi_texts[]' => 'TEXT'; | |
}; | |
my $image = scraper { | |
process '//*[@id="photo"]/img', 'img' => [ '@src', sub { "$_[0]" } ]; | |
}; | |
main(); | |
exit; | |
sub main { | |
my $res = get_images("$uri_base/photos/$id"); | |
while (my $path = get_next_url($res)) { | |
$res = get_images("$uri_base/$path"); | |
} | |
} | |
sub get_images { | |
my $url = shift; | |
$mech->get($url); | |
my $res = $list->scrape($mech->content); | |
for my $src (@{$res->{links}}) { | |
$mech->get("$uri_base$src"); | |
my $img = $image->scrape($mech->content)->{img}; | |
_store($img); | |
} | |
return $res; | |
} | |
sub _store { | |
my $url = shift; | |
(my $file = basename $url) =~ s/\?.*//; | |
$file =~ s/-[\w+.]+-\w+//; | |
print STDERR "SKIP: $file\n" and return if -e $file; | |
print "getting $file ... "; | |
mirror $url, $file; | |
print "done.\n"; | |
} | |
sub get_next_url { | |
my $res = shift; | |
my $urls = $res->{navi_urls}; | |
my $texts = $res->{navi_texts}; | |
return $urls->[-1] if @$urls >= 2 && $texts->[-1] =~ /OLDER/ && $urls->[-1] =~ /\?page=/; | |
return $urls->[0] if $texts->[0] =~ /OLDER/ && $urls->[0] =~ /\?page=/; | |
return; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment