xaicron/gist:95591

## gistfile1.PL
# pixivから指定したユーザーIDの画像一覧を取得する
# Usage: $0 [-i] id [-d save_directory] [-p max_page_number] [-e encode] [-s]
#
# 2009-12-26T02:40:21 漫画形式も対応した
# 2010-01-23T05:18:17 form_numberを指定しないと動かなくなってた
#

use strict;
use warnings;
use utf8;
use Web::Scraper;
use WWW::Mechanize;
use Encode qw/find_encoding decode_utf8/;
use File::Basename qw/fileparse/;
use File::Spec::Functions qw/catfile/;
use Getopt::Long qw/GetOptions/;
use Config::Pit qw/pit_get/;

# Encoding of the file system; used for file names unless an encoding
# option is given.
my $default_encoding = 'cp932';

# Credentials come from the Config::Pit entry 'pixiv.net'; both keys have
# to be set there beforehand.
my $config   = pit_get('pixiv.net');
my $username = $config->{username};
die 'username not found.' unless $username;
my $password = $config->{password};
die 'password not found.' unless $password;

# Characters that are forbidden in Windows file names, each mapped to the
# look-alike character that replaces it.
my %win32_taboo;
@win32_taboo{ '\\', '/', ':', '*', '?', '"', '<', '>', '|' }
    = ( '￥', '／', '：', '＊', '？', '″', '＜', '＞', '｜' );

# Command-line handling.  The member id may be given positionally or with
# the id option; everything else is optional.
my $usage = "Usage: $0 [-i] id [-d save_directory] [-p max_page_number] [-e encode]";

# GetOptions returns false on an unknown or malformed switch; stop there
# instead of running with half-parsed options.
GetOptions(\my %opt, 'id=i', 'directory=s', 'page=i', 'encode=s', 'stop')
    or die $usage;

# Option check: the id is mandatory, the rest fall back to defaults.
my $id = shift || $opt{id} || die $usage;
my $dir = $opt{directory} || $id;
my $max_page = $opt{page} || 1;

# find_encoding returns undef for a name it does not know; fail here with a
# clear message rather than later when the first file name is encoded.
my $enc_name = $opt{encode} || $default_encoding;
my $enc = find_encoding($enc_name)
    or die "unknown encoding: $enc_name";

# Browser object shared by the whole script.  autocheck => 1 makes
# WWW::Mechanize die by itself when a request fails.
my $mech = WWW::Mechanize->new(autocheck => 1);

# Log in: load the top page and submit its second form with the stored
# credentials.
$mech->get('http://www.pixiv.net/index.php');
$mech->submit_form(
    form_number => 2,
    fields => {
        pixiv_id => $username,
        pass     => $password,
    },
);

# NOTE(review): this only checks the HTTP status of the form submission;
# whether the site actually accepted the credentials is not verified here.
$mech->res->is_success or die "login failed. status: ", $mech->status;

# Create the save directory on first use, then work inside it so that all
# downloads can be written with relative file names.
mkdir $dir or die "$dir $!" unless -d $dir;
chdir $dir or die "$dir $!";

# Scraper for a member's illustration list page.  Every <li> below the
# element with id "illust_c4" becomes one entry of 'list'.
my $scraper = scraper {
    process '//*[@id="illust_c4"]/ul/li', 'list[]' => scraper {
        # Link of the work, with mode=medium rewritten to mode=big.
        process 'a', url => [ '@href', sub {
            (my $url = $_->as_string) =~ s/mode=medium/mode=big/;
            return $url;
        } ];
        # Thumbnail address with the "_s" in front of the extension removed.
        process 'a>img', img => [ '@src', sub {
            (my $img = $_->as_string) =~ s/_s(\.[^.]+)$/$1/;
            return $img;
        } ];
        # Text of the item's div, used later as the title part of the file name.
        process 'div', title => 'TEXT';
    };
};

# Scraper that extracts only the page <title>; is_manga matches it against
# a pattern to recognise manga works.
my $is_manga_sp = scraper {
    process '/html/head/title', title => 'TEXT';
};

# Scraper for the mode=manga_tb page: the image address found in each
# element with class "tb_div" (collected into 'tb_list'), plus the text of
# the link inside the element with class "manga_title".
my $manga_tb_sp = scraper {
    process '//*[@class="tb_div"]', 'tb_list[]' => scraper {
        # Stringify the src value as-is; no rewriting is applied here.
        process 'a>img', img => [ '@src', sub { "$_" } ];
    };
    process '//*[@class="manga_title"]/a', title => 'TEXT';
};

# Entry point: run the download and leave before the sub definitions.
main();
exit;

# Walk the listing pages from 1 up to the requested maximum, handing each
# page number and the shared browser object to get_pict.
sub main {
    for my $page (1 .. $max_page) {
        get_pict($page, $mech);
    }
}

# Download every work listed on one page of the member's illustration list.
#
#   $page - page number; a false value leaves the page parameter off the URL
#   $mech - WWW::Mechanize object used to fetch the listing
#
# Dies when the listing page cannot be fetched.  Manga works are handed to
# get_manga, everything else to save_file.
sub get_pict {
    my ($page, $mech) = @_;

    my $url = "http://www.pixiv.net/member_illust.php?id=" . $id;
    $url .= sprintf("&p=%d", $page) if $page;

    $mech->get($url);
    $mech->res->is_success or die "get content failed. status:", $mech->status;

    my $res = $scraper->scrape($mech->content, $mech->uri);

    for my $row (@{ $res->{list} || [] }) {
        # A list item without a link or a thumbnail has nothing that could
        # be downloaded; skip it instead of passing undef further down.
        next unless defined $row->{url} && defined $row->{img};

        # Later requests for this work carry its page as the Referer.
        $mech->add_header( Referer => $row->{url} );

        if (is_manga($row->{url})) {
            get_manga($row->{url});
            next;
        }

        save_file($row->{img}, $row->{title});
    }
}

# Tell whether a work is a manga (multi-image) post.
#
#   $url - address of the work containing mode=big
#
# Fetches the mode=manga variant of the address with the shared browser
# object and returns 1 when the page title matches the manga title pattern,
# 0 otherwise.
sub is_manga {
    my ($big_url) = @_;

    my $manga_url = $big_url;
    $manga_url =~ s/mode=big/mode=manga/;
    $mech->get($manga_url);

    my $page_title = $is_manga_sp->scrape($mech->content, $mech->uri)->{title};

    return 1 if $page_title =~ /の漫画 \[pixiv\]$/;
    return 0;
}

# Save all images of a manga work.
#
#   $url - address of the work containing mode=big
#
# Loads the mode=manga_tb variant of the address and passes every image
# address scraped from it to save_file, together with the manga title.
sub get_manga {
    my ($big_url) = @_;

    my $tb_url = $big_url;
    $tb_url =~ s/mode=big/mode=manga_tb/;
    $mech->get($tb_url);

    my $res   = $manga_tb_sp->scrape($mech->content, $mech->uri);
    my $title = $res->{title};

    for my $thumb (@{$res->{tb_list}}) {
        save_file($thumb->{img}, $title);
    }
}

# Download one image into the current directory.
#
#   $url   - address of the image
#   $title - title of the work; becomes part of the file name
#
# The file is named "<image id>_<title><extension>" and encoded with the
# selected file-system encoding; characters the encoding cannot represent
# are written as U+XXXX.  When the file already exists the download is
# skipped, or the whole program exits if the stop option was given.
# Dies when the file cannot be opened, written or closed.
sub save_file {
    my ($url, $title) = @_;

    # A work without a title still gets a usable file name.
    $title = '' unless defined $title;

    if ($^O eq 'MSWin32') {
        $title = _win32_file_normalize($title);
    }
    else {
        # Elsewhere '/' is the path separator: left in place it would turn
        # the title into a directory path and make open fail.
        $title =~ s{/}{／}g;
    }

    my ($img_id, $ext) = (fileparse $url, qr/\.(?:jpe?g|gif|png|bmp)$/)[0,2];

    my $file = catfile(
        $enc->encode(
            sprintf ('%s_%s%s', $img_id, $title, $ext),
            sub{ sprintf "U+%04X", shift },
        )
    );

    if (-f $file) {
        if ($opt{stop}) {
            print "up to date.\n";
            exit;
        }

        print "skip $file\n";
        return;
    }

    print "Download $file\n";
    $mech->get($url);

    open my $fh, '>', $file or die "$file $!";
    binmode $fh;
    # Buffered write errors may only surface at close, so check both.
    print {$fh} $mech->content or die "$file $!";
    close $fh or die "$file $!";
}

# Replace every character that Windows forbids in file names with its
# substitute from %win32_taboo and return the resulting name.
sub _win32_file_normalize {
    my ($name) = @_;
    $name =~ s{([\\/:*?"<>|])}{$win32_taboo{$1}}g;
    return $name;
}

## gistfile2.js
// Run on an individual pixiv user's top page: shows the pixiv_get.pl
// command line for that user (id taken from the address, directory name
// taken from the text of the second child node of #profile).
(function(d,i){
    // The g flag is required: without it replace() stops after the first
    // alternative and trailing whitespace is left in the name.
    d=document.getElementById('profile').childNodes[1].textContent.replace(/^\s+|\s+$/g,'');
    i=location.href.match(/id=(\d+)/)[1];
    alert('pixiv_get.pl -i ' + i + ' -d ' + d);
})();
	# pixivから指定したユーザーIDの画像一覧を取得する
	# Usage: $0 [-i] id [-d save_directory] [-p max_page_number] [-e encode] [-s]
	#
	# 2009-12-26T02:40:21 漫画形式も対応した
	# 2010-01-23T05:18:17 form_numberを指定しないと動かなくなってた
	#

	use strict;
	use warnings;
	use utf8;
	use Web::Scraper;
	use WWW::Mechanize;
	use Encode qw/find_encoding decode_utf8/;
	use File::Basename qw/fileparse/;
	use File::Spec::Functions qw/catfile/;
	use Getopt::Long qw/GetOptions/;
	use Config::Pit qw/pit_get/;

	# Encoding of the file system; used for file names unless an encoding
	# option is given.
	my $default_encoding = 'cp932';

	# Credentials come from the Config::Pit entry 'pixiv.net'; both keys
	# have to be set there beforehand.
	my $config = pit_get('pixiv.net');
	my $username = $config->{username} or die 'username not found.';
	my $password = $config->{password} or die 'password not found.';

	# Characters that are forbidden in Windows file names, each mapped to
	# the look-alike character that replaces it.  The pipe key must be the
	# bare one-character string: inside single quotes a backslash before
	# '|' stays in the key and the lookup for '|' would come back undef.
	my %win32_taboo = (
	    '\\' => '￥',
	    '/'  => '／',
	    ':'  => '：',
	    '*'  => '＊',
	    '?'  => '？',
	    '"'  => '″',
	    '<'  => '＜',
	    '>'  => '＞',
	    '|'  => '｜',
	);

	# Command-line handling.  The member id may be given positionally or
	# with the id option; everything else is optional.
	my $usage = "Usage: $0 [-i] id [-d save_directory] [-p max_page_number] [-e encode]";

	# GetOptions returns false on an unknown or malformed switch; stop
	# there instead of running with half-parsed options.
	GetOptions(\my %opt, 'id=i', 'directory=s', 'page=i', 'encode=s', 'stop')
	    or die $usage;

	# Option check: the id is mandatory, the rest fall back to defaults.
	# (The escaped pipes that stood here were not valid Perl.)
	my $id = shift || $opt{id} || die $usage;
	my $dir = $opt{directory} || $id;
	my $max_page = $opt{page} || 1;

	# find_encoding returns undef for a name it does not know; fail here
	# with a clear message rather than when the first file name is encoded.
	my $enc_name = $opt{encode} || $default_encoding;
	my $enc = find_encoding($enc_name)
	    or die "unknown encoding: $enc_name";

	# Browser object shared by the whole script.  autocheck => 1 makes
	# WWW::Mechanize die by itself when a request fails.
	my $mech = WWW::Mechanize->new(autocheck => 1);

	# Log in: load the top page and submit its second form with the stored
	# credentials.
	$mech->get('http://www.pixiv.net/index.php');
	$mech->submit_form(
	    form_number => 2,
	    fields => {
	        pixiv_id => $username,
	        pass     => $password,
	    },
	);

	# NOTE(review): this only checks the HTTP status of the form submission;
	# whether the site actually accepted the credentials is not verified.
	$mech->res->is_success or die "login failed. status: ", $mech->status;

	# Create the save directory on first use, then work inside it so that
	# all downloads can be written with relative file names.
	mkdir $dir or die "$dir $!" unless -d $dir;
	chdir $dir or die "$dir $!";

	# Scraper for a member's illustration list page.  Every <li> below the
	# element with id "illust_c4" becomes one entry of 'list'.
	my $scraper = scraper {
	process '//*[@id="illust_c4"]/ul/li', 'list[]' => scraper {
	# Link of the work, with mode=medium rewritten to mode=big.
	process 'a', url => [ '@href', sub {
	(my $url = $_->as_string) =~ s/mode=medium/mode=big/;
	return $url;
	} ];
	# Thumbnail address with the "_s" in front of the extension removed.
	process 'a>img', img => [ '@src', sub {
	(my $img = $_->as_string) =~ s/_s(\.[^.]+)$/$1/;
	return $img;
	} ];
	# Text of the item's div, used later as the title part of the file name.
	process 'div', title => 'TEXT';
	};
	};

	# Scraper that extracts only the page <title>; is_manga matches it
	# against a pattern to recognise manga works.
	my $is_manga_sp = scraper {
	process '/html/head/title', title => 'TEXT';
	};

	# Scraper for the mode=manga_tb page: the image address found in each
	# element with class "tb_div" (collected into 'tb_list'), plus the text
	# of the link inside the element with class "manga_title".
	my $manga_tb_sp = scraper {
	process '//*[@class="tb_div"]', 'tb_list[]' => scraper {
	# Stringify the src value as-is; no rewriting is applied here.
	process 'a>img', img => [ '@src', sub { "$_" } ];
	};
	process '//*[@class="manga_title"]/a', title => 'TEXT';
	};

	# Entry point: run the download and leave before the sub definitions.
	main();
	exit;

	# Walk the listing pages from 1 up to the requested maximum, handing
	# each page number and the shared browser object to get_pict.
	sub main {
	    for my $page (1 .. $max_page) {
	        get_pict($page, $mech);
	    }
	}

	# Download every work listed on one page of the member's illustration
	# list.
	#
	#   $page - page number; a false value leaves the page parameter off
	#           the URL
	#   $mech - WWW::Mechanize object used to fetch the listing
	#
	# Dies when the listing page cannot be fetched.  Manga works are handed
	# to get_manga, everything else to save_file.
	sub get_pict {
	    my ($page, $mech) = @_;

	    my $url = "http://www.pixiv.net/member_illust.php?id=" . $id;
	    $url .= sprintf("&p=%d", $page) if $page;

	    $mech->get($url);
	    $mech->res->is_success or die "get content failed. status:", $mech->status;

	    my $res = $scraper->scrape($mech->content, $mech->uri);

	    for my $row (@{ $res->{list} || [] }) {
	        # A list item without a link or a thumbnail has nothing that
	        # could be downloaded; skip it instead of passing undef on.
	        next unless defined $row->{url} && defined $row->{img};

	        # Later requests for this work carry its page as the Referer.
	        $mech->add_header( Referer => $row->{url} );

	        if (is_manga($row->{url})) {
	            get_manga($row->{url});
	            next;
	        }

	        save_file($row->{img}, $row->{title});
	    }
	}

	# Tell whether a work is a manga (multi-image) post.
	#
	#   $url - address of the work containing mode=big
	#
	# Fetches the mode=manga variant of the address with the shared browser
	# object and returns 1 when the page title matches the manga title
	# pattern, 0 otherwise.
	sub is_manga {
	    my ($big_url) = @_;

	    my $manga_url = $big_url;
	    $manga_url =~ s/mode=big/mode=manga/;
	    $mech->get($manga_url);

	    my $page_title = $is_manga_sp->scrape($mech->content, $mech->uri)->{title};

	    return 1 if $page_title =~ /の漫画 \[pixiv\]$/;
	    return 0;
	}

	# Save all images of a manga work.
	#
	#   $url - address of the work containing mode=big
	#
	# Loads the mode=manga_tb variant of the address and passes every image
	# address scraped from it to save_file, together with the manga title.
	sub get_manga {
	    my ($big_url) = @_;

	    my $tb_url = $big_url;
	    $tb_url =~ s/mode=big/mode=manga_tb/;
	    $mech->get($tb_url);

	    my $res   = $manga_tb_sp->scrape($mech->content, $mech->uri);
	    my $title = $res->{title};

	    for my $thumb (@{$res->{tb_list}}) {
	        save_file($thumb->{img}, $title);
	    }
	}

	# Download one image into the current directory.
	#
	#   $url   - address of the image
	#   $title - title of the work; becomes part of the file name
	#
	# The file is named "<image id>_<title><extension>" and encoded with
	# the selected file-system encoding; characters the encoding cannot
	# represent are written as U+XXXX.  When the file already exists the
	# download is skipped, or the whole program exits if the stop option
	# was given.  Dies when the file cannot be opened, written or closed.
	sub save_file {
	    my ($url, $title) = @_;

	    # A work without a title still gets a usable file name.
	    $title = '' unless defined $title;

	    if ($^O eq 'MSWin32') {
	        $title = _win32_file_normalize($title);
	    }
	    else {
	        # Elsewhere '/' is the path separator: left in place it would
	        # turn the title into a directory path and make open fail.
	        $title =~ s{/}{／}g;
	    }

	    # The alternation needs bare pipes; with escaped pipes the pattern
	    # only matched a literal "jpg|gif|png|bmp" and no real extension
	    # was ever split off.
	    my ($img_id, $ext) = (fileparse $url, qr/\.(?:jpe?g|gif|png|bmp)$/)[0,2];

	    my $file = catfile(
	        $enc->encode(
	            sprintf ('%s_%s%s', $img_id, $title, $ext),
	            sub{ sprintf "U+%04X", shift },
	        )
	    );

	    if (-f $file) {
	        if ($opt{stop}) {
	            print "up to date.\n";
	            exit;
	        }

	        print "skip $file\n";
	        return;
	    }

	    print "Download $file\n";
	    $mech->get($url);

	    open my $fh, '>', $file or die "$file $!";
	    binmode $fh;
	    # Buffered write errors may only surface at close, so check both.
	    print {$fh} $mech->content or die "$file $!";
	    close $fh or die "$file $!";
	}

	# Replace every character that Windows forbids in file names with its
	# substitute from %win32_taboo and return the resulting name.
	sub _win32_file_normalize {
	    my ($name) = @_;
	    $name =~ s{([\\/:*?"<>\|])}{$win32_taboo{$1}}g;
	    return $name;
	}
	// Run on an individual pixiv user's top page: shows the pixiv_get.pl
	// command line for that user (id taken from the address, directory
	// name taken from the text of the second child node of #profile).
	(function(d,i){
	    // Trim both ends: the alternation needs a bare pipe and the g flag,
	    // otherwise nothing (or only the leading whitespace) is removed.
	    d=document.getElementById('profile').childNodes[1].textContent.replace(/^\s+|\s+$/g,'');
	    i=location.href.match(/id=(\d+)/)[1];
	    alert('pixiv_get.pl -i ' + i + ' -d ' + d);
	})();