Skip to content

Instantly share code, notes, and snippets.

@xaicron
Created April 15, 2009 03:43
Show Gist options
  • Save xaicron/95591 to your computer and use it in GitHub Desktop.
Save xaicron/95591 to your computer and use it in GitHub Desktop.
pixivから指定したユーザーIDの画像一覧を取得する
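# Fetch the list of images for the given pixiv user ID and download them.
# Usage: $0 [-i] id [-d save_directory] [-p max_page_number] [-e encode] [-s]
#
# 2009-12-26T02:40:21 Added support for manga-format works.
# 2010-01-23T05:18:17 Login had stopped working unless form_number was given; it is now specified.
#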
use strict;
use warnings;
use utf8;
use Web::Scraper;
use WWW::Mechanize;
use Encode qw/find_encoding decode_utf8/;
use File::Basename qw/fileparse/;
use File::Spec::Functions qw/catfile/;
use Getopt::Long qw/GetOptions/;
use Config::Pit qw/pit_get/;
# Encoding used for file names when -e/--encode is not given.
my $default_encoding = 'cp932';
# Login credentials are read through Config::Pit from the "pixiv.net" entry.
# Both values must be present (and true), otherwise the script aborts.
my $config   = pit_get('pixiv.net');
my $username = $config->{username};
die 'username not found.' unless $username;
my $password = $config->{password};
die 'password not found.' unless $password;
# Characters that Windows forbids in file names, mapped to look-alike
# characters that are allowed (mostly the full-width forms).
# The replacements are written as \x{...} escapes on purpose: a literal
# full-width character that gets folded to its ASCII twin by an editor or a
# copy/paste round trip would turn the entry into a no-op (e.g. '/' => '/').
my %win32_taboo = (
    '\\' => "\x{FFE5}",    # FULLWIDTH YEN SIGN
    '/'  => "\x{FF0F}",    # FULLWIDTH SOLIDUS
    ':'  => "\x{FF1A}",    # FULLWIDTH COLON
    '*'  => "\x{FF0A}",    # FULLWIDTH ASTERISK
    '?'  => "\x{FF1F}",    # FULLWIDTH QUESTION MARK
    '"'  => "\x{2033}",    # DOUBLE PRIME
    '<'  => "\x{FF1C}",    # FULLWIDTH LESS-THAN SIGN
    '>'  => "\x{FF1E}",    # FULLWIDTH GREATER-THAN SIGN
    '|'  => "\x{FF5C}",    # FULLWIDTH VERTICAL LINE
);
# Command-line handling.
#   --id         pixiv user ID (may also be given as the first plain argument)
#   --directory  directory to save into (defaults to the user ID)
#   --page       highest listing page number to fetch (defaults to 1)
#   --encode     encoding used for file names (defaults to $default_encoding)
#   --stop       exit at the first image that already exists locally
my $usage = "Usage: $0 [-i] id [-d save_directory] [-p max_page_number] [-e encode]";
# GetOptions returns false when it met an option it could not parse; stop
# here instead of running with a half-understood command line.
GetOptions(\my %opt, 'id=i', 'directory=s', 'page=i', 'encode=s', 'stop')
    or die $usage;
# Option check: a plain argument takes precedence over --id.
my $id = shift || $opt{id} || die $usage;
my $dir = $opt{directory} || $id;
my $max_page = $opt{page} || 1;
my $enc_name = $opt{encode} || $default_encoding;
# find_encoding returns undef for a name it does not know. Fail now rather
# than at the first file that has to be written, after logging in.
my $enc = find_encoding($enc_name)
    or die "unknown encoding: $enc_name";
# User agent. autocheck => 1 asks WWW::Mechanize to die on failed requests.
my $mech = WWW::Mechanize->new(autocheck => 1);
# Log in: load the top page and submit its second form with the credentials.
$mech->get('http://www.pixiv.net/index.php');
my %credentials = (
    pixiv_id => $username,
    pass     => $password,
);
$mech->submit_form(form_number => 2, fields => \%credentials);
die "login faild. status: ", $mech->status unless $mech->res->is_success;
# Work inside the save directory, creating it first when it is missing.
unless (-d $dir) {
    mkdir $dir or die "$dir $!";
}
chdir $dir or die "$dir $!";
# Scraper for a member's illustration listing page. Every <li> below the
# element with id "illust_c4" becomes one entry of {list} with the keys:
#   url   - the link's href with "mode=medium" rewritten to "mode=big"
#   img   - the image's src with the "_s" right before the extension removed
#   title - the text of the entry's <div>
# The filters read the attribute value from $_, as set up by Web::Scraper.
my $to_big_url = sub {
    my $url = $_->as_string;
    $url =~ s/mode=medium/mode=big/;
    return $url;
};
my $to_full_img = sub {
    my $img = $_->as_string;
    $img =~ s/_s(\.[^.]+)$/$1/;
    return $img;
};
my $list_item = scraper {
    process 'a',     'url',   [ '@href', $to_big_url ];
    process 'a>img', 'img',   [ '@src',  $to_full_img ];
    process 'div',   'title', 'TEXT';
};
my $scraper = scraper {
    process '//*[@id="illust_c4"]/ul/li', 'list[]', $list_item;
};
# Scraper that extracts only the page <title> into {title}; is_manga()
# matches that text against the manga-page title pattern.
my $is_manga_sp = scraper {
    process '/html/head/title', 'title', 'TEXT';
};
# Scraper for the "manga_tb" page: each element of class "tb_div" adds one
# {tb_list} entry whose {img} is the stringified src of its linked image, and
# the text of the link inside the "manga_title" element becomes {title}.
my $stringify = sub { "$_" };
my $manga_tb_entry = scraper {
    process 'a>img', 'img', [ '@src', $stringify ];
};
my $manga_tb_sp = scraper {
    process '//*[@class="tb_div"]', 'tb_list[]', $manga_tb_entry;
    process '//*[@class="manga_title"]/a', 'title', 'TEXT';
};
# Entry point: run main(), then leave the script.
main();
exit;

# Process listing pages 1 through $max_page, one get_pict() call per page.
sub main {
    for my $page (1 .. $max_page) {
        get_pict($page, $mech);
    }
}
# Download everything listed on one page of the member's illustration index.
#   $page - page number; added to the listing URL as "p" when it is true
#   $mech - the WWW::Mechanize object used to fetch the listing
# Each listed work is handed to get_manga() when is_manga() says it is a
# manga, and to save_file() otherwise.
sub get_pict {
    my ($page, $mech) = @_;

    my $list_url = "http://www.pixiv.net/member_illust.php?id=" . $id;
    if ($page) {
        $list_url .= sprintf("&p=%d", $page);
    }
    $mech->get($list_url);
    die "get content faild. status:", $mech->status
        unless $mech->res->is_success;

    my $listing = $scraper->scrape($mech->content, $mech->uri);
    for my $entry (@{ $listing->{list} }) {
        my $work_url = $entry->{url};
        # Later requests carry the work's page as their Referer header.
        $mech->add_header(Referer => $work_url);
        if (is_manga($work_url)) {
            get_manga($work_url);
        }
        else {
            save_file($entry->{img}, $entry->{title});
        }
    }
}
# Tell whether the work behind $url is a manga.
# The URL is fetched with "mode=big" rewritten to "mode=manga" (using the
# file-level $mech) and the title of the resulting page is tested.
# Returns 1 for a manga, 0 otherwise.
sub is_manga {
    my ($url) = @_;
    my $manga_url = $url;
    $manga_url =~ s/mode=big/mode=manga/;
    $mech->get($manga_url);
    my $page = $is_manga_sp->scrape($mech->content, $mech->uri);
    return 1 if $page->{title} =~ /の漫画 \[pixiv\]$/;
    return 0;
}
# Download all images of a manga work.
# The URL is fetched with "mode=big" rewritten to "mode=manga_tb"; every
# image found on that page is saved under the manga's title.
sub get_manga {
    my ($url) = @_;
    my $tb_url = $url;
    $tb_url =~ s/mode=big/mode=manga_tb/;
    $mech->get($tb_url);
    my $manga = $manga_tb_sp->scrape($mech->content, $mech->uri);
    my $title = $manga->{title};
    for my $entry (@{ $manga->{tb_list} }) {
        save_file($entry->{img}, $title);
    }
}
# Download one image and store it in the current directory.
#   $url   - address of the image
#   $title - title of the work; becomes part of the file name
# The file is named "<basename of $url>_<title><extension>" and the name is
# encoded with $enc; characters that $enc cannot represent are written as
# "U+XXXX". When the file already exists nothing is downloaded: the call
# returns, or, with --stop, the whole script exits.
# Dies when the download, the open or the write fails.
sub save_file {
    my ($url, $title) = @_;
    # A work without a title gets an empty title part instead of a warning.
    $title = '' unless defined $title;
    if ($^O eq 'MSWin32') {
        $title = _win32_file_normalize($title);
    }
    else {
        # "/" would be taken as a directory separator and make open() fail,
        # so replace it with FULLWIDTH SOLIDUS.
        $title =~ s{/}{\x{FF0F}}g;
    }
    my ($img_id, $ext) = (fileparse $url, qr/\.(?:jpe?g|gif|png|bmp)$/)[0,2];
    my $file = catfile(
        $enc->encode(
            sprintf('%s_%s%s', $img_id, $title, $ext),
            sub { sprintf "U+%04X", shift },
        )
    );
    if (-f $file) {
        if ($opt{stop}) {
            print "up to date.\n";
            exit;
        }
        print "skip $file\n";
        return;
    }
    print "Download $file\n";
    # Fetch before opening so a failed request leaves no empty file behind.
    $mech->get($url);
    open my $fh, '>', $file or die "$file $!";
    binmode $fh;
    print {$fh} $mech->content or die "$file $!";
    # Buffered write errors (e.g. a full disk) only show up at close time.
    close $fh or die "$file $!";
}
# Return a copy of the given file name in which every character that Windows
# forbids in file names is replaced by its entry in %win32_taboo.
sub _win32_file_normalize {
    my ($file_name) = @_;
    $file_name =~ s{([\\/:*?"<>|])}{$win32_taboo{$1}}g;
    return $file_name;
}
// Bookmarklet for a pixiv user's top page: shows the pixiv_get.pl command
// line for that user. The id comes from the "id=" parameter of the page URL,
// the directory name from the text of the second child node of the element
// with id "profile".
(function(d,i){
    d=document.getElementById('profile');
    d=d&&d.childNodes[1];
    i=location.href.match(/id=(\d+)/);
    // Without a profile node or an id in the URL there is nothing to build.
    if(!d||!i){
        alert('pixiv_get.pl: could not find the user name or id on this page');
        return;
    }
    // The "g" flag matters: without it only the first match (the leading
    // whitespace) is removed and trailing whitespace stays in the name.
    d=d.textContent.replace(/^\s+|\s+$/g,'');
    // Quote names that contain whitespace so they stay a single argument.
    if(/\s/.test(d)){
        d='"'+d+'"';
    }
    alert('pixiv_get.pl -i ' + i[1] + ' -d ' + d);
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment