Last active
March 11, 2019 22:25
-
-
Save yunga/4b98e5c2e53dbcbd4c6c to your computer and use it in GitHub Desktop.
Get Google Images search results.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use Mojolicious::Lite; | |
use Mojo::Util qw(url_escape url_unescape); | |
use File::Path qw(make_path); | |
use Getopt::Std; | |
##### Available colors and sizes on google image search ###
# Maps each -c value to the Google "tbs" color filter it selects.
# An empty string means "do not filter".
my %color = (
    full  => "ic:color",
    color => "ic:color",    # alias: the usage text documents "-c color"
    bw    => "ic:gray",
    any   => "",
    # Every specific color uses the same "ic:specific,isc:<name>" form.
    # NOTE(review): Google may spell the grey filter "isc:gray" -- TODO confirm.
    (
        map { $_ => "ic:specific,isc:$_" }
            qw(black blue brown green grey orange pink purple red teal white yellow)
    ),
);
# Maps each named -s value to the Google "tbs" size filter it selects.
# An empty string means "do not filter".
my %size = (
    icon   => "isz:i",
    medium => "isz:m",
    large  => "isz:l",
    any    => "",
    # "Larger than" presets all use the "isz:lt,islt:<name>" form.
    (map { $_ => "isz:lt,islt:$_" } qw(qsvga vga svga xga 2mp 4mp)),
);
# Screen size: start from a 1920x1080 fallback and overwrite it with the
# "current W x H" resolution reported by xrandr, when that output is available.
my %screen = (w => 1920, h => 1080);
if (`xrandr 2>&1` =~ /, current (\d+) x (\d+),/) {
    @screen{qw(w h)} = ($1, $2);
}
##### Get command line options (%opt) and set Search options (@tbs) ###
# Help, Verbose, Urls-Only, Directory, Number, Parallel, Filetype, Size, Color
my (%opt, @tbs);
# getopts() returns false when it meets an unknown option or a missing
# argument (it has already warned on STDERR); show the usage instead of
# silently carrying on with a half-parsed command line.
getopts('hvud:n:p:f:s:c:', \%opt)
    or help("$0: Invalid command line options");
# Display help
# Prints an optional error message followed by the usage text, then exits
# with status -1. Never returns.
#   $_[0]  optional message printed before the usage text
sub help {
    my ($message) = @_;
    say "\n$message" if defined $message;
    # The usage text interpolates the script name ($0) and the detected
    # screen size (%screen).
    say qq!
USAGE: $0 [options] search terms
Get Google Images search results. Version 20150303
-h Help, displays this message.
-v Verbose output. Actually it's more like debug infos.
-u Urls-only, displays the urls found and exits, don't download.
-d Directory where the images will be saved.
-n Number of urls to retrieve, between 1 and 100, defaults to 16.
-p Number of parallel downloads, defaults to 16.
-f Filetypes separated by commas, for example: png,gif
-s Size, locally defaults to $screen{w}x$screen{h}. Can also be:
icon medium large any
qsvga (>480x300) vga (>640x480) svga (>800x600)
xga (>1024x768) 2mp (>1600x1200) 4mp (>2272x1704)
-c Color selected in:
black blue brown green grey orange
pink purple red teal white yellow
full -> for full color images
bw -> for black and white images
any -> don't search colors (Default)
The script is silent by default. The exit code is the number of images
downloaded / urls printed, -1 for an error or the help message.
EXAMPLES:
$0 milkyway
Save 16 $screen{w}x$screen{h} milkyway images in the current directory.
$0 -n 50 -c bw -d "anime wall" bad apple
Save 50 black&white images about bad apple in the "./anime wall" folder.
$0 -vs 4mp macrophoto insects
Verbosely save 16 images larger than 2272x1704 in the current folder.
$0 -f gif animated
Search animated wallpapers
Have fun :)
Based on the idea of Tyrell Rutledge, perl stuff by Yunga Palatino.
See: http://reddit.com/r/commandline/2vog7b/
All right reversed. Feel free to copy/modify/redistribute/print/eat/sell.
PS: Use at your own risks. Computer may catch fire.
!;
    exit -1;
}
# Show the usage when asked for it (-h), or when the script was started
# with neither options nor search terms.
help() if $opt{h};
help() unless %opt or @ARGV;
# Color names: case-insensitive lookup in %color; unknown names end in help().
if (defined $opt{c}) {
    my $wanted = $opt{c} = lc $opt{c};
    my $filter = $color{$wanted};
    help("$0: Unknown color '$wanted'") unless defined $filter;
    # An empty filter ("any") adds nothing to the search options.
    push @tbs, $filter if $filter;
}
# Size: a named preset from %size, an exact WIDTHxHEIGHT, or -- when -s is
# not given -- the detected screen size.
if (!defined $opt{s}) {
    push @tbs, "isz:ex,iszw:$screen{w},iszh:$screen{h}";    # Default
}
else {
    my $wanted = $opt{s} = lc $opt{s};
    if (defined(my $preset = $size{$wanted})) {             # Named size option
        # An empty preset ("any") adds nothing to the search options.
        push @tbs, $preset if $preset;
    }
    elsif (my ($width, $height) = $wanted =~ /^(\d+)x(\d+)$/) {    # WIDTHxHEIGHT
        push @tbs, "isz:ex,iszw:$width,iszh:$height";
    }
    else {                                                  # Huh?
        help("$0: Unknown size '$wanted'");
    }
}
# Filetypes
# Build the already URL-encoded "filetype:" terms appended to the query.
#   $_[0]  comma separated list of extensions, e.g. "png,gif" (may be undef)
# Returns "" when no filetype is requested, otherwise a string starting with
# "%20". Several types are combined with OR so that an image of any of the
# listed types matches, and no raw space ends up in the URL.
sub _filetype_terms {
    my ($list) = @_;
    return "" unless $list;
    my @types = grep { length } split /[\s,]+/, $list;
    return "" unless @types;
    return "%20" . join "%20OR%20", map { "filetype:$_" } @types;
}
# Only plain word-like extensions are accepted; anything else would be
# pasted unescaped into the search URL.
help("$0: Invalid filetype list '$opt{f}'")
    if $opt{f} and $opt{f} !~ /\A\s*\w+(?:\s*,\s*\w+)*\s*\z/;
my $filetypes = _filetype_terms($opt{f});
# Number of image to download (!= to the number of url fetched later)
# $numdown holds the index of the last wanted image, i.e. the count minus one.
my $numdown = 16 - 1;
if (defined $opt{n}) {
    # Report a bad value like the other option errors do, instead of
    # exiting without a word.
    help("$0: Invalid number of urls '$opt{n}'")
        unless $opt{n} =~ /\A\d+\z/ and $opt{n} > 0;
    $numdown = $opt{n} - 1;
}
# Number of parallel downloads
my $parallel = $opt{p} // 16;
# Zero or a non-numeric value would start no download worker at all.
help("$0: Invalid number of parallel downloads '$parallel'")
    unless $parallel =~ /\A\d+\z/ and $parallel > 0;
# Directory
# Images are saved in the current directory unless -d names another one,
# which is created when it does not exist yet.
my $dir = ".";
if (defined $opt{d}) {
    if (!-d $opt{d}) {
        # Without the "error" option make_path() croaks on failure; collecting
        # the errors lets us report them ourselves and keep the -1 exit code.
        make_path($opt{d}, { error => \my $errors });
        if (!-d $opt{d}) {
            my @reasons = map { values %$_ } @{ $errors || [] };
            say "$0: There was a problem creating '$opt{d}' directory.\n"
                . join("\n", @reasons);
            exit -1;
        }
    }
    $dir = $opt{d};
}
##### Search images ###
my $ua = Mojo::UserAgent->new(
    max_redirects      => 10,
    inactivity_timeout => 30
);
# Make the search query-ckroll ;)
# Without search terms the query falls back to a fixed, pre-encoded one.
my $q = "https://www.google.com/search?tbm=isch&"
    . "q=" . ($#ARGV == -1 ? "%52%69%63%6b%20%41%73%74%6c%65%79" : url_escape(join " ", @ARGV))
    . $filetypes
    . "&tbs=" . join(",", @tbs);
# Fetch the page, select all elements of class rg_l and capture the image url -- xxx: it doesn't seem to contain duplicates
# Elements whose markup does not match the pattern yield an empty value; they
# are dropped here so that they neither end a download worker early nor show
# up as blank lines in the url list.
my @imgurl = grep { defined $_ && length $_ } $ua->get($q, {
    "User-Agent" => "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 Chrome/24.0.1312.57"
} )->res->dom(".rg_l")->map(sub {
    # $_ is the matched element, used here through its string form.
    # Accept http and https links and multi-label domains such as google.co.uk.
    m!href="https?://www\.google\.[\w.]+/imgres\?imgurl=(.*?)&! and $1;
})->each;
# Print out some data, this should be called debug instead of verbose...
my $found = scalar @imgurl;    # $#imgurl is the last index, one less than the count
if ($opt{v}) {
    say "Query: $q";
    # Check the number of image found
    if ($found == 0) {
        say "No result found.";
    }
    elsif ($found == 1) {
        say "1 url found! We've got a GoogleWhack!!!";
    }
    else {
        say "Found $found urls.";
    }
}
# Nothing to print or download: leave with the documented exit code (0 urls),
# whether verbose or not.
exit 0 if $found == 0;
# Never ask for more images than there are urls.
$numdown = $#imgurl if $#imgurl < $numdown;
# Display the urls and exits
if (defined $opt{u}) {
    say join "\n", @imgurl[0 .. $numdown];
    # The exit code is the number of urls printed; $numdown is the last index.
    exit $numdown + 1;
}
##### Fetch images in non-blocking way ###
my $count  = 0;    # images saved so far
my $active = 0;    # requests currently in flight
my $downloader;
# Starts the next download for worker $id, if the target is not reached and a
# url is left. Each finished request chains into the next one, so at most
# $parallel requests run at the same time.
$downloader = sub {
    my $id = shift;
    return if $count > $numdown;

    # Take the next usable url, skipping empty entries.
    my $url;
    while (@imgurl) {
        $url = shift @imgurl;
        last if defined $url and length $url;
    }
    return unless defined $url and length $url;

    $url = url_unescape($url) while $url ne url_unescape($url);    # because it's escaped in the google link...

    $active++;
    $ua->get( $url, {
        "User-Agent" => "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 Chrome/24.0.1312.57",
        "Referer"    => $url,    # you cannot link to our images, yeah, yeah, yeah xxx: make some white/blacklist of sites?
    } => sub {
        my ($ua, $tx) = @_;
        $active--;
        return if $count > $numdown;    # target already reached, drop late answers

        my $url = $tx->req->url;
        # $tx->error is used instead of $tx->success, which newer Mojolicious
        # releases deprecated and then removed.
        if (my $err = $tx->error) {
            say "Error: " . ($err->{code} // "") . " $err->{message} - $url" if defined $opt{v};
        }
        else {
            my $res = $tx->res;

            # Try to get extension from mime-type
            my $type = $res->headers->content_type // "";
            my $mime = $type =~ m!image/(\w+)!i ? $1 : "unknown";

            # "Sanitize" url path, we may use it for the filename
            my $url_file = $url->path->to_string;
            $url_file = url_unescape($url_file) while $url_file ne url_unescape($url_file);
            $url_file =~ tr!a-zA-Z0-9_\-./!!cd;        # we only keep those chars for filenames (whitespace goes too)
            $url_file =~ s!([_\-./])+!$1!g;            # squeeze runs of separators

            my $file;
            if ($url_file =~ m!^.*?/?([a-z0-9_.\-]+\.\w+)/?$!i) {     # hostname - file.ext
                $file = $url->host . " - $1";
            }
            elsif ($url_file =~ m!^.*?/?([a-z0-9_\-.]+)/?$!i) {       # hostname - file.mimetype
                $file = $url->host . " - $1.$mime";
            }
            else {                                                     # hostname - search query.mimetype
                # A "/" in the search terms must not be taken for a directory.
                (my $terms = "@ARGV") =~ tr!/!_!;
                $file = $url->host . " - $terms.$mime";
            }

            # Check if file exists, and search for index number the previous filename
            my ($duplicate, $index, $fullname) = (0, 0, "$dir/$file");
            while (-f $fullname) {
                # "_" reuses the stat data of the -f test just above.
                if ( -s _ == $res->content->asset->size ) {
                    $duplicate = 1;
                    say "Got $url\n Skipped duplicate: $fullname (" . $res->content->asset->size . " bytes)" if $opt{v};
                    last;    # Same server, same name, same size, probably the same file, we skip.
                }
                if ($fullname =~ m!\.(\d+)\.(?:\w+)$!) {
                    # name.N.ext is taken: try name.(N+1).ext
                    $index = $1 + 1;
                    $fullname =~ s!\.(?:\d+)\.(\w+)$!.$index.$1!;
                }
                else {
                    # name.ext is taken: try name.1.ext
                    $index++;
                    $fullname =~ s!\.(\w+)$!.$index.$1!;
                }
            }

            # Save file
            if (!$duplicate) {
                $count++;
                $res->content->asset->move_to($fullname);
                say "Got $url\n Saved " . $res->content->asset->size . " bytes as $fullname" if $opt{v};
            }
        }

        # Keep this worker busy while images are still wanted and urls are left.
        $downloader->($id) if $count <= $numdown and @imgurl;

        # Stop once the target is reached, or once the last request in flight
        # has finished. Stopping merely because the url queue is empty would
        # throw away the other parallel downloads that are still running.
        if ($count > $numdown or !$active) {
            say "Downloaded $count image", $count > 1 ? "s" : "" if $opt{v};
            Mojo::IOLoop->stop;
        }
    });
};
$downloader->($_) for 1 .. $parallel;
# Only enter the event loop when at least one request was actually started.
Mojo::IOLoop->start if $active and !Mojo::IOLoop->is_running;
exit $count;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
line 33 -> getopts('hvudw:n:f:s:c:', %opt);
append to desc => -w Write only Urls to data.txt file.
add option ->
Write urls to the data.txt file and exit
if (defined $opt{w}) {
say (join "\n", @imgurl[0..$numdown]);
open (MYFILE, '>>data.txt');
print MYFILE (join "\n", @imgurl[0..$numdown]);
close (MYFILE);
exit;
}
need option to specify (output filename)
ability to wrap @imgurl into html link "a href"
Playing with this in ruby/sinatra