Skip to content

Instantly share code, notes, and snippets.

@gray
Last active February 9, 2017 18:58
Show Gist options
  • Save gray/796139 to your computer and use it in GitHub Desktop.
Save gray/796139 to your computer and use it in GitHub Desktop.
Scrape IMDb to get a list of the best classic films.
#!/usr/bin/env perl
use 5.010;
use strict;
use warnings;
use Carp::Always;
use List::Util qw(any none);
use URI;
use URI::QueryParam;
use Web::Scraper::LibXML;
# Release-year window: the main loop issues one search per year in
# MIN_YEAR .. MAX_YEAR.
use constant MIN_YEAR => 1930;
use constant MAX_YEAR => 1985;

# Thresholds sent to the search as the num_votes / user_rating parameters.
use constant MIN_VOTES  => 100;
use constant MIN_RATING => 6.5;

# Endpoint that every search URI is built from.
use constant BASE_URI => 'http://www.imdb.com/search/title';

# Genre filter: a title is printed only when it has at least one INCLUDE
# genre and none of the EXCLUDE genres (compared in lower case).
use constant INCLUDE_GENRES => (
    'action',  'adventure', 'comedy',   'crime',
    'mystery', 'sci-fi',    'thriller', 'war',
);
use constant EXCLUDE_GENRES => ('documentary', 'musical');

# Write UTF-8 on both standard output streams, and flush STDOUT after
# every write.
binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';
STDOUT->autoflush(1);
# Scraper for a single search-result page.
#
# The scraped structure contains:
#   results - one entry per 'div.lister-item-content' element, holding
#             title, url, genres (list), rating, votes and cast (list)
#   next    - href of the 'a.next-page' link, when the page has one
my $scraper = scraper {
    process 'div.lister-list div.lister-item-content',
        'results[]' => scraper {
            # The title link supplies both the display title and the URL.
            process 'a[href ^= "/title/tt"]',
                title => 'TEXT', url => '@href';

            # The genre element holds one comma-separated string; drop all
            # whitespace, then split it into individual genre names.
            process 'span.genre', 'genres[]' => sub {
                (my $genres = $_[0]->as_text) =~ s/\s+//g;
                return split /,/, $genres;
            };

            process 'div.rating-list > meta[itemprop="ratingValue"]',
                rating => '@content';
            process 'div.rating-list > meta[itemprop="ratingCount"]',
                votes => '@content';

            # Every paragraph with an empty class attribute is inspected;
            # only text following a "Stars:" label contributes cast names.
            process 'p[class=""]', 'cast[]' => sub {
                my ($cast) = ($_[0]->as_text // '') =~ /\sStars:\s(.*)\s/s;

                # A paragraph without a "Stars:" label leaves $cast undefined.
                # Default it *before* the substitution so no "uninitialized
                # value" warning is raised; the split then yields no names.
                $cast //= '';

                $cast =~ s/^ \s+ | \s+ $//gx;
                return split /\s*,\s*/, $cast;
            };
        };
    process 'div.nav a.next-page', next => '@href';
};
# Query parameters shared by every request, kept as an ordered list so the
# generated query string lists them in this order.
# NOTE(review): the trailing comma on num_votes / user_rating presumably
# leaves the upper bound open -- confirm against the IMDb search syntax.
my @search_params = (
    count       => 100,
    num_votes   => sprintf('%s,', MIN_VOTES),
    sort        => 'user_rating,desc',
    title_type  => 'feature',
    user_rating => sprintf('%s,', MIN_RATING),
);

# Base search URI; each per-year request works on a clone of it.
my $uri = URI->new(BASE_URI);
$uri->query_form(@search_params);
# Query the search one release year at a time, follow pagination within each
# year, and print one ' | '-separated line per title that passes the genre
# filter: year | rating | votes | title | url | genres | cast
for my $year (MIN_YEAR .. MAX_YEAR) {
    # Track the page being fetched in its own variable so the base $uri is
    # neither shadowed nor overwritten.
    my $page_uri = $uri->clone;
    $page_uri->query_param(release_date => "$year,$year");

    while ($page_uri) {
        my $res = $scraper->scrape($page_uri);

        MOVIE:
        for my $movie (@{ $res->{results} // [] }) {
            # Tolerate entries for which no genre list was scraped; they
            # simply fail the INCLUDE check below.
            my %genres = map { ( lc($_) => 1 ) } @{ $movie->{genres} // [] };
            next MOVIE if any  { exists $genres{$_} } EXCLUDE_GENRES;
            next MOVIE if none { exists $genres{$_} } INCLUDE_GENRES;

            # A field that was not scraped prints as an empty column rather
            # than triggering an "uninitialized value" warning.
            my @fields = map { $_ // '' } @$movie{qw(rating votes title url)};

            say join ' | ', $year, @fields,
                join(',', sort keys %genres),
                join(', ', @{ $movie->{cast} // [] });
        }

        # Follow the "next page" link, if any. The continue block below only
        # runs once per year, so pause here as well to keep the same one
        # second spacing between consecutive page requests.
        $page_uri = $res->{next};
        sleep 1 if $page_uri;
    }
}
continue { sleep 1 }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment