Skip to content

Instantly share code, notes, and snippets.

@briandfoy
Last active July 17, 2019 23:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save briandfoy/ba81bab108a0021f1d78e3cbc19c7d63 to your computer and use it in GitHub Desktop.
Save briandfoy/ba81bab108a0021f1d78e3cbc19c7d63 to your computer and use it in GitHub Desktop.
Turn the list of US National Park sites into JSON
#!/Users/brian/bin/perl
use v5.10;
use open qw(:std :utf8);
=head1 NAME

national_parks.pl - scrape the National Park sites

=head1 SYNOPSIS

    $ perl national_parks.pl

=head1 DESCRIPTION

This program scrapes the US National Park site to create a JSON array
of each of the National Park sites.

=head1 AUTHOR & COPYRIGHT

Copyright © 2019, brian d foy <brian.d.foy@gmail.com>

=head1 LICENSE

You can use this code under the terms of the Artistic License 2.0.

=cut
use Mojo::DOM;
use Mojo::File;
use Mojo::JSON qw(encode_json to_json);
use Mojo::URL;
use Mojo::UserAgent;
use Mojo::Util qw(dumper encode decode trim);
# Load the page (get_body decides between the network and the local
# cache file), then turn every collapsible section of the page into one
# hash reference via process_item.
my $page     = get_body();
my $sections = $page
    ->find( 'div.collapsible-item' )
    ->map( \&process_item )
    ->to_array;

# The standard filehandles already have a UTF-8 layer ("use open" above),
# so print characters with to_json instead of bytes with encode_json.
say to_json $sections;
# process_item( $item )
#
# Convert one 'div.collapsible-item' element (a Mojo::DOM object) into
#
#   { section => $title, sites => [ \%site, ... ] }
#
# where every %site has the keys
#
#   link  - absolute URL of the site's anchor, or '' when there is none
#   text  - the site's name
#   state - a string for a single state, or an array reference when the
#           text lists several states (comma- or "and"-separated)
#
# The element is taken from the first argument, falling back to $_, so
# the sub works both as a Mojo::Collection->map callback and when called
# directly. Dies if the element has no title link.
sub process_item {
    my $item = @_ ? $_[0] : $_;

    my $title_selector = join ' ', qw(
        div.collapsible-item-heading
        h4.collapsible-item-title
        a.collapsible-item-title-link
    );
    my $title = $item->at($title_selector)
        or die "No section title link found in collapsible item\n";

    # Drop everything from the first parenthesised suffix onwards.
    my $section = $title->all_text =~ s/\s+\(.+//r;

    my @sites;
    my $body = $item->at('div.collapsible-item-body');
    return { section => $section, sites => \@sites } unless $body;

    # The stuff on the inside is separated by <br /> tags. Each piece is
    # re-parsed on its own so it can be queried independently.
    my @fragments = map { Mojo::DOM->new( trim($_) ) } split /<br.*?>/, "$body";

    # The first piece still starts with the wrapping <div>; keep only
    # what is inside it.
    if (@fragments) {
        my $wrapper = $fragments[0]->at('div');
        $fragments[0] = Mojo::DOM->new( $wrapper->content ) if $wrapper;
    }

    my $base = base_link();

    FRAGMENT:
    foreach my $fragment (@fragments) {
        # Pieces holding only markup or whitespace (for example the
        # closing tag after the last <br>) are not sites.
        next FRAGMENT if $fragment->all_text !~ /\S/;

        # Text that sits directly in the fragment, outside any child tag.
        my $own_text = $fragment->text;

        my %site;

        # Some things don't have links. In that case the name is whatever
        # comes before the first comma.
        if ( my $anchor = $fragment->at('a') ) {
            $site{link} = Mojo::URL->new( $anchor->attr('href') // '' )
                ->base($base)->to_abs->to_string;
            $site{text} = $anchor->text;
        }
        else {
            $site{link} = '';
            $site{text} = $own_text =~ s/,.*//sr;
        }

        # Everything after the FIRST comma is the state list. A greedy
        # match up to the last comma would throw away all but the final
        # state. trim() returns its result, so it must be assigned.
        my $state = trim( $own_text =~ s/\A[^,]*,\s*//r );

        # Split on commas and on the whole word "and" only, so names such
        # as "Maryland" or "Rhode Island" stay intact.
        my @states = grep { length } split /\s*(?:,|\band\b)\s*/, $state;
        $site{state} = @states > 1 ? \@states : $state;

        push @sites, \%site;
    }

    return { section => $section, sites => \@sites };
}
# base_link()
#
# Returns a newly constructed Mojo::URL for the National Park System
# page. A new object is built on every call.
sub base_link {
    my $address = 'https://www.nps.gov/aboutus/national-park-system.htm';
    return Mojo::URL->new($address);
}
# get_body()
#
# Returns a Mojo::DOM for the page at base_link().
#
# The raw response body is cached in "$0.txt". When that file exists it
# is read instead of hitting the network; otherwise the page is fetched
# and the cache file is written. In both cases the bytes are decoded as
# UTF-8 in one place before parsing, so the first (network) run and later
# (cached) runs produce the same DOM.
#
# Dies if the request is unsuccessful or if the cache file cannot be
# read or written.
sub get_body {
    my $path = Mojo::File->new("$0.txt");

    my $bytes;
    if ( -e $path ) {
        open my $fh, '<:raw', "$path"
            or die "Could not open $path for reading: $!\n";
        $bytes = do { local $/; <$fh> };
        close $fh
            or die "Could not close $path: $!\n";
    }
    else {
        my $url = base_link();
        my $res = Mojo::UserAgent->new->get($url)->result;
        die 'Failed to fetch ' . $url . ': '
            . ( $res->code // '?' ) . ' ' . ( $res->message // '' ) . "\n"
            unless $res->is_success;

        $bytes = $res->body;

        # the body is raw bytes, so don't encode them
        open my $fh, '>:raw', "$path"
            or die "Could not open $path for writing: $!\n";
        print {$fh} $bytes
            or die "Could not write to $path: $!\n";
        close $fh
            or die "Could not close $path: $!\n";
    }

    $bytes //= '';

    # Mojo::Util::decode returns undef on malformed input; fall back to
    # the undecoded bytes rather than parsing nothing.
    my $html = decode( 'UTF-8', $bytes ) // $bytes;

    return Mojo::DOM->new($html);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment