Skip to content

Instantly share code, notes, and snippets.

@seungwon0
Created March 2, 2011 15:10
Show Gist options
  • Save seungwon0/851068 to your computer and use it in GitHub Desktop.
Save seungwon0/851068 to your computer and use it in GitHub Desktop.
Scrap images from dcinside gallery
#!/usr/bin/env perl
#
# dcscrap - scrap images from dcinside gallery
#
# dcscrap downloads image files from dcinside gallery.
#
# Dcinside is a South Korean internet forum. Initially established as
# a community dedicated to digital cameras and photography, it has met
# broad notoriety in Korea due to its unique nature.
#
# Original One-Liner: https://gist.github.com/729723
#
# Seungwon Jeong <seungwon0@gmail.com>
#
# Copyright (C) 2010 by Seungwon Jeong
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see
# <http://www.gnu.org/licenses/>.
use strict;
use warnings;
use 5.010;
use utf8;
use English qw( -no_match_vars );
use LWP::Simple qw( get );
use Encode qw( encode_utf8 );
use Getopt::Long;
use IO::Prompt;
use Image::ExifTool qw( ImageInfo );
use autodie;
# Print the version banner, licence notice and command-line help to
# the currently selected output handle.  Takes no arguments and
# returns nothing.
sub print_usage {
    my $usage_text = <<'END_USAGE';
dcscrap 0.5.1
Copyright (C) 2011 by Seungwon Jeong
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Usages:
dcscrap [OPTIONS] ADDRESS
dcscrap [OPTIONS] GALLERY NUM1 [NUM2]
Arguments:
ADDRESS the address of a post
GALLERY the name of a gallery (e.g. girlsgeneration_new)
NUM1, NUM2 the serial number of a post
Options:
-C, --directory DIR change to directory DIR
-o, --overwrite always overwrite
-n, --no-overwrite do not overwrite
Examples:
dcscrap http://gall.dcinside.com/Jessica/1495723
dcscrap 'http://gall.dcinside.com/list.php?id=yoona&no=1011909&page=16&bbs='
dcscrap --directory ~/Pictures taehee 96482
dcscrap -n racinggirl 227120 227180
Please report bugs to <seungwon0@gmail.com>.
END_USAGE

    print $usage_text;

    return;
}
# Extract the gallery name and the post number from a post address.
#
# Two address shapes are recognised:
#   http://gall.dcinside.com/GALLERY/NUM
#   http://gall.dcinside.com/list.php?id=GALLERY&no=NUM...
#
# Returns the list (GALLERY, NUM) on success and an empty list when
# the address matches neither shape.
sub get_gallery_and_num_from {
    my $address = shift;

    my $gallery_url = quotemeta 'http://gall.dcinside.com';

    # The patterns are anchored with \A and \z rather than ^ and $:
    # under /m the latter also match at embedded line boundaries, which
    # would let an address containing a newline slip through.

    # e.g. http://gall.dcinside.com/Jessica/1495723
    my $pattern1
        = qr{\A $gallery_url / (?<gallery>[^/]+) / (?<num>\d+) \z}xms;

    # e.g. http://gall.dcinside.com/list.php?id=yoona&no=1011909&...
    # (left open at the end: further query parameters may follow)
    my $pattern2 = qr{\A $gallery_url / list[.]php
                      [?]id= (?<gallery>[^&]+) &no= (?<num>\d+)}xms;

    if ( $address =~ $pattern1 || $address =~ $pattern2 ) {

        # %+ holds the named captures of the last successful match.
        return $+{gallery}, $+{num};
    }

    return;
}
# Download and save every image found in a range of posts.
#
# Takes one hash reference with the keys:
#   gallery      - gallery name
#   num1, num2   - first and last post number of the range (inclusive)
#   overwrite    - passed through to save_image
#   no_overwrite - passed through to save_image
#
# Returns nothing when gallery_is_available() rejects the gallery,
# otherwise 1 after the whole range has been walked.  An image that
# cannot be downloaded is reported with a warning and skipped.
sub get_images {
    my $arg_ref = shift;

    my ( $gallery, $first_num, $last_num, $overwrite, $no_overwrite )
        = @{$arg_ref}{qw( gallery num1 num2 overwrite no_overwrite )};

    return if !gallery_is_available($gallery);

    for my $post_num ( $first_num .. $last_num ) {
        say "[${gallery} #${post_num}]";    # Print progress

        # Position of the image within the post; it is part of the file
        # name and advances even when a download fails.
        my $position = -1;

        IMAGE_SRC:
        for my $image_src ( get_image_srcs( $gallery, $post_num ) ) {
            $position++;

            say 'Downloading image...';     # Print progress

            my $image = get($image_src);
            if ( !defined $image ) {
                warn "Cannot download $image_src\n";
                next IMAGE_SRC;
            }

            # File name: GALLERY_POST_POSITION, plus an extension when
            # the image type could be identified.
            my $filename = join q{_}, $gallery, $post_num, $position;
            my $ext      = get_img_file_ext_for( \$image );
            my $has_ext  = defined $ext && $ext ne q{};
            $filename .= ".$ext" if $has_ext;

            save_image(
                {   image_ref    => \$image,
                    filename     => $filename,
                    overwrite    => $overwrite,
                    no_overwrite => $no_overwrite,
                }
            );
        }
    }

    return 1;
}
# Collect the image addresses embedded in one post.
#
# Arguments: the gallery name and the serial number of the post.
#
# Returns the list of matching image addresses.  Returns an empty list
# when the page cannot be fetched, when dcinside_is_busy() or
# post_is_deleted() recognises the page, or when no image is found (a
# warning is printed in that last case).
sub get_image_srcs {
    my $gallery = shift;
    my $num     = shift;

    my $address = "http://gall.dcinside.com/list.php?id=${gallery}&no=${num}";
    my $webpage = get($address);
    if ( !defined $webpage ) {
        warn "Cannot fetch $address.\n";
        return;
    }

    # The notice checks must see the page as it was fetched.  This file
    # is compiled under "use utf8", so the Korean phrases in those
    # checks are character strings; they can never match the UTF-8
    # octets produced by encode_utf8 below.
    # NOTE(review): this relies on get() returning decoded text, as the
    # LWP::Simple documentation describes -- confirm for the installed
    # version.
    return if dcinside_is_busy($webpage);
    return if post_is_deleted($webpage);

    # The addresses themselves are still taken from the UTF-8 encoded
    # page so that the values handed to the caller are unchanged.
    my $encoded_page = encode_utf8($webpage);

    # src='http://dcimg1.dcinside.com/viewimage.php...'
    # or
    # src="http://uccfs.paran.com/PUD/..."
    my $pattern = qr{src=['"]
                     ( http://
                         (?:
                             dcimg1[.]dcinside[.]com/viewimage[.]php
                         |
                             uccfs[.]paran[.]com/PUD/
                         )
                         [^'"]+
                     )}xms;

    my @image_srcs = ( $encoded_page =~ /$pattern/g );
    if ( !@image_srcs ) {
        warn "Cannot find any images.\n";
    }

    return @image_srcs;
}
# Write image data to a file in the current directory.
#
# Takes one hash reference with the keys:
#   image_ref    - reference to the scalar holding the image data
#   filename     - name of the file to write
#   overwrite    - true: replace an existing file without asking
#   no_overwrite - true: never replace an existing file
#
# When the file already exists and neither flag is set, the user is
# asked for confirmation.  no_overwrite is consulted first, so it wins
# when both flags are set.  Returns nothing.
sub save_image {
    my $arg_ref = shift;

    my ( $image_ref, $filename, $overwrite, $no_overwrite )
        = @{$arg_ref}{qw( image_ref filename overwrite no_overwrite )};

    my $already_exists = -f $filename;

    return if $already_exists && $no_overwrite;

    if ( $already_exists && !$overwrite ) {
        my $question = "Overwrite '$filename'? [y/n] ";
        return if !prompt( $question, -yes_no );
    }

    say "Saving ${filename}...";    # Print progress

    # open and close are not checked here: the file enables autodie.
    open my $out_fh, '>', $filename;
    binmode $out_fh;
    print {$out_fh} ${$image_ref};
    close $out_fh;

    return;
}
# Choose a file name extension for image data held in memory.
#
# Argument: a reference to the scalar holding the image data.
#
# Returns 'bmp', 'gif', 'jpg' or 'png' according to the FileType
# reported by ImageInfo, and nothing for any other (or missing) type.
sub get_img_file_ext_for {
    my $image_ref = shift;

    # A plain lookup table replaces the former given/when block, which
    # draws experimental/deprecation warnings on newer perls.
    my %ext_for = (
        BMP  => 'bmp',
        GIF  => 'gif',
        JPEG => 'jpg',
        PNG  => 'png',
    );

    my $image_info = ImageInfo( $image_ref, 'FileType' );
    my $file_type  = $image_info->{FileType};

    # No FileType reported: the extension is unknown.
    return if !defined $file_type;

    return $ext_for{$file_type} if exists $ext_for{$file_type};

    return;
}
# Check that a gallery exists and that the site is accepting requests.
#
# Argument: the gallery name.
#
# Returns 1 when the gallery's list page can be fetched and carries
# neither the "board has not been created" notice nor the notice
# recognised by dcinside_is_busy().  Otherwise returns nothing; a
# warning explains the reason.
sub gallery_is_available {
    my $gallery = shift;

    my $address = "http://gall.dcinside.com/list.php?id=${gallery}";
    my $webpage = get($address);
    if ( !defined $webpage ) {
        warn "Cannot fetch $address.\n";
        return;
    }

    # The page is matched as fetched, without re-encoding it to UTF-8
    # octets first: this file is compiled under "use utf8", so the
    # Korean phrases below are character strings and can never match
    # an octet string.
    # NOTE(review): this relies on get() returning decoded text, as the
    # LWP::Simple documentation describes -- confirm for the installed
    # version.

    # The phrase reads roughly "board that has not been created".
    if ( $webpage =~ /생성되지 [ ] 않은 [ ] 게시판/xms ) {
        warn "'$gallery' gallery doesn't seem to exist.\n";
        return;
    }

    return if dcinside_is_busy($webpage);

    return 1;
}
# Tell whether a fetched page carries the site's overload notice.
#
# Argument: the text of the page.
#
# Returns 1 (after printing a warning) when the notice is present and
# nothing otherwise.
sub dcinside_is_busy {
    my ($page_text) = @_;

    # The phrase reads roughly "there are many users".
    return if $page_text !~ /사용자가 [ ] 많아/xms;

    warn "Dcinside is busy now.\n";

    return 1;
}
# Tell whether a fetched page carries the "post deleted" notice.
#
# Argument: the text of the page.
#
# Returns 1 (after printing a warning) when the notice is present and
# nothing otherwise.
sub post_is_deleted {
    my ($page_text) = @_;

    # The phrase reads roughly "the post in question ... deleted".
    return if $page_text !~ /해당 [ ] 게시물은 [ ] 삭제/xms;

    warn "The post has been deleted.\n";

    return 1;
}
# Main program: parse the command line, validate the post range and
# download the images.  Exit status is 2 for a usage error and 1 when
# get_images() reports failure.

my $directory;
my $overwrite;
my $no_overwrite;

# -o and -n are declared as explicit aliases because the usage text
# documents them; without the aliases they only work through
# Getopt::Long's automatic abbreviation of long option names.
my @options = (
    'C|directory=s'  => \$directory,
    'o|overwrite'    => \$overwrite,
    'n|no-overwrite' => \$no_overwrite,
);

if ( !GetOptions(@options) ) {
    print_usage();
    exit 2;
}

if ( defined $directory ) {
    die "'$directory' directory does not exist.\n" if !-d $directory;
    chdir $directory;    # failure is fatal: the file enables autodie
}

my ( $gallery, $num1, $num2 );

# The arguments are shifted off @ARGV rather than merely read, so that
# @ARGV is empty afterwards, exactly as before.
# NOTE(review): presumably this matters to prompt(), which may read
# from ARGV -- verify against the IO::Prompt documentation before
# changing it.
my $arg_count = scalar @ARGV;

if ( $arg_count == 1 ) {
    my $address = shift @ARGV;
    ( $gallery, $num1 ) = get_gallery_and_num_from($address);
    if ( !defined $gallery || !defined $num1 ) {
        print_usage();
        exit 2;
    }
}
elsif ( $arg_count == 2 || $arg_count == 3 ) {
    $gallery = shift @ARGV;
    $num1    = shift @ARGV;
    $num2    = shift @ARGV;    # undef when only two arguments were given
}
else {
    print_usage();
    exit 2;
}

# A single post is a range of one.
$num2 //= $num1;

# Anchored with \A and \z: under /m, ^ and $ would accept a value that
# merely contains a line consisting of a number.
my $num_pattern = qr/\A [1-9] \d* \z/xms;
if ( $num1 !~ $num_pattern || $num2 !~ $num_pattern ) {
    print_usage();
    exit 2;
}

# Accept the range in either order.
if ( $num1 > $num2 ) {
    ( $num1, $num2 ) = ( $num2, $num1 );
}

my $arg_ref = {
    gallery      => $gallery,
    num1         => $num1,
    num2         => $num2,
    overwrite    => $overwrite,
    no_overwrite => $no_overwrite,
};

get_images($arg_ref) or exit 1;
__END__

=head1 NAME

dcscrap - scrap images from dcinside gallery

=head1 SYNOPSIS

=over

=item dcscrap [OPTIONS] ADDRESS

=item dcscrap [OPTIONS] GALLERY NUM1 [NUM2]

=back

=head1 DESCRIPTION

dcscrap downloads image files from dcinside gallery.

Dcinside is a South Korean internet forum. Initially established as a
community dedicated to digital cameras and photography, it has met
broad notoriety in Korea due to its unique nature.

=head1 ARGUMENTS

=over

=item ADDRESS

the address of a post

=item GALLERY

the name of a gallery (e.g. girlsgeneration_new)

=item NUM1, NUM2

the serial number of a post

=back

=head1 OPTIONS

=over

=item -C, --directory F<DIR>

change to directory F<DIR>

=item -o, --overwrite

always overwrite

=item -n, --no-overwrite

do not overwrite

=back

=head1 EXAMPLES

    dcscrap http://gall.dcinside.com/Jessica/1495723
    dcscrap 'http://gall.dcinside.com/list.php?id=yoona&no=1011909&page=16&bbs='
    dcscrap --directory ~/Pictures taehee 96482
    dcscrap -n racinggirl 227120 227180

=head1 URL

L<https://gist.github.com/851068>

=head1 AUTHOR

Seungwon Jeong E<lt>seungwon0@gmail.comE<gt>

=cut
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment