#!/usr/bin/perl
use strict;
use warnings;
#use Carp::Always;
use Data::Dumper;
use Try::Tiny;
use DateTime;
use DateTime::Format::Natural;
use DateTime::Format::DBI;
use Getopt::Long;
use Pod::Usage;
use LWP;
use Web::Scraper;
use URI::Escape;
use Config::Any;
use Dir::Self;
use lib __DIR__ . "/lib";
use FR::Schema;
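# FR::Schema (in ./lib, not included in this gist) is presumably a
# DBIx::Class schema; the Text resultset used below carries org, dates,
# and batch relations.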
my $opts;
GetOptions( 'start|s=s' => \$opts->{start},
            'end|e=s'   => \$opts->{end},
            'force|f'   => \$opts->{force},
            'yaml|y=s'  => \$opts->{yml},
            'dump|d'    => \$opts->{dump},
            'debug=i'   => \$opts->{debug},
            'help|h'    => \$opts->{help},
            'man|m'     => \$opts->{man}
) or pod2usage( { -verbose => 0, -output => \*STDERR,
                  -msg => "$0 no parameter found.\nUse -help for more options.\n"
} );

if ($opts->{man}) {
  pod2usage( -verbose => 2 );
} elsif ($opts->{help} or !$opts->{start}) {
  pod2usage( -verbose => 0, -output => \*STDERR,
             -msg => "$0 [options]" );
}
##############
# INIT stuff #
##############
# TODO: allow alternative yaml file and document $opts
my $config = Config::Any->load_files( {
  files   => [ ($opts->{yml} // 'config.yml') ],
  use_ext => 1
} );
my $yml   = $config->[0]{$opts->{yml} // 'config.yml'};
my $orgs  = $yml->{orgs};
my $creds = $yml->{creds} or die "No sql information\n";
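# Expected config.yml layout, inferred from the keys read below
# (values are illustrative, not authoritative):
#
#   creds:
#     type: mysql          # DBI driver name
#     name: fr             # database name
#     host: localhost      # optional
#     user: someuser
#     pass: somepass
#   orgs:
#     FCC: Federal Communications Commission
#   scraper:
#     base_url: http://...
#     base_path: ?...
#     discard: [ ... ]     # optional: regexes; matching pages are dropped
#     sections: [ ... ]    # optional: keep entries having these parts
#   DEBUG: 0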
my @connect = ( 'dbi:' . $creds->{type} .
                ':database=' . $creds->{name} .
                ($creds->{host} ? ';host=' . $creds->{host} : ""),
                $creds->{user} ? $creds->{user} : "",
                $creds->{pass} ? $creds->{pass} : ""
);
my $schema = FR::Schema->connect(@connect);
my $text = $schema->resultset('Text');
my $dtf = $schema->storage->datetime_parser;
my $browser = LWP::UserAgent->new();
my $dtparse = DateTime::Format::Natural->new;
my $DEBUG = $opts->{debug} || $yml->{DEBUG} || "0";
my $edt;
if ($opts->{end}) {
  $edt = $dtparse->parse_datetime($opts->{end});
  die "end date parser error: " . $dtparse->error . "\n" unless $dtparse->success;
} else {
  $edt = DateTime->now()->set(hour => 0, minute => 0, second => 0);
}
my $sdt = $dtparse->parse_datetime($opts->{start});
die "start date parser error: " . $dtparse->error . "\n" unless $dtparse->success;
die "start date must occur prior to end date\n" unless $sdt <= $edt;
my $curdt = $sdt->clone->subtract(days => 1)->set(hour => 0, minute => 0, second => 0);
my $get_org_url = get_org_url();
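# $curdt is backed up one day so the add(days => 1) at the top of the
# main loop lands on the requested start date itself. get_org_url()
# returns a closure that caches the scraped year/month/day URL tree
# across iterations.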
##############
# start work #
##############
my @errs;
while ($curdt < $edt) {
  $curdt->add(days => 1);
  print $curdt->ymd("/") . ":\t" . $curdt->day_name . " " . $curdt->month_name . " " . $curdt->day . "\n";
  foreach my $org (keys %$orgs) {
    my $hist_rs = $text->search( {
      'dates.pub_date' => {
        '-between' => [
          $dtf->format_datetime($curdt->clone->subtract(days => 1)),
          $dtf->format_datetime($curdt)
        ]
      },
      'org.symbol' => $org
    }, {
      join => [ qw/dates org batch/ ]
    } );
    print "ORG: " . $orgs->{$org} . ": " . $hist_rs->count . "\n" if ($DEBUG >= 1);
    my $last_run = 0;
    if ($hist_rs->count > 0) {
      if ($opts->{force}) {
        # Read the previous batch number before deleting; the rows being
        # deleted are what carry it.
        $last_run = $hist_rs->get_column('batch.run')->max() // 0;
        $hist_rs->delete;
      } elsif ($opts->{dump}) {
        print "Have records, -dump so processing anyway\n" if ($DEBUG >= 1);
      } else {
        print "Skipped $org because " . $hist_rs->count . " records for this org already exist;" .
              " use -force to process current FR data for this org\n" if ($DEBUG >= 2);
        next;
      }
    }
    my $urls = $get_org_url->($curdt, $orgs->{$org});
    print "Main urls from get_org_url: " . Dumper($urls) if ($DEBUG >= 3);
    next if (ref($urls) ne 'ARRAY' or scalar(@$urls) == 0);
    my $res;
    foreach my $url (@$urls) {
      my ($ret, $err) = get_web_dat($url->{url});
      push @$res, $ret if ($ret);
      push @errs, { "page err" => $err } if ($err);
    }
    my $dtnow = DateTime->now();
    unless ($opts->{dump}) {
      foreach my $url (@$res) {
        foreach my $pagetext (@$url) {
          my $part = $pagetext->{text}{part};
          try {
            $schema->txn_do( sub {
              $text->create( {
                stamp       => $dtnow,
                description => $part->{DESC}    // "",
                summary     => $part->{SUMMARY} // "",
                address     => $part->{ADDRESS} // "",
                action      => $part->{ACTION}  // "",
                page        => $part->{PAGE}    // "",
                text_uri    => $pagetext->{text}{link},
                pdf_uri     => $pagetext->{pdf},
                org => {
                  symbol => $org,
                  web_ag => $part->{AGENCY} // "",
                  agency => $orgs->{$org}
                },
                dates => {
                  dates    => $part->{DATES} // "",
                  pub_date => $curdt
                },
                batch => {
                  run => ($last_run + 1)
                }
              }, {
                key => 'text_pk'
              } );
            } );
          } catch {
            push @errs, { "db err" => { date => $curdt->ymd("/"), org => $org, page => $pagetext } };
            print "err: " . $_ . "\n";
          };
        }
      }
    } else { # -dump stuff
      print "Results: " . Dumper($res) if ($res);
    }
  }
}
print "Fail: " . Dumper(@errs) if (@errs);
sub get_org_url {
  my $urls;
  return sub {
    my ($dt, $org) = @_;
    if ($dt->day_name eq 'Saturday' or
        $dt->day_name eq 'Sunday' or
        ($urls->{count} // 1) == 0 or
        ($urls->{$dt->year}{count} // 1) == 0 or
        ($urls->{$dt->year}{$dt->month_name}{count} // 1) == 0 or
        ($urls->{$dt->year}{$dt->month_name}{$dt->day}{count} // 1) == 0)
    {
      print "Skipping get_org_url for $org\n" if ($DEBUG >= 2);
      return;
    }
    # Turn the agency name into an anchored regex, with spaces loosened to .+
    (my $regex = $org) =~ s| |.+|g;
    $regex = '^.*' . $regex . '.*$';
    # get urls of years
    if (!$urls->{$dt->year}{url}) {
      $urls = get_date_url(undef, 'year');
      die "No year data on " . $dt->year if (!$urls->{$dt->year}{url});
    }
    # get urls of months
    if (!$urls->{$dt->year}{$dt->month_name}{url}) {
      $urls->{$dt->year} = get_date_url($urls->{$dt->year}{url}, 'month');
      die "No month data on " . $dt->month_name if (!$urls->{$dt->year}{$dt->month_name}{url});
    }
    # get urls of days
    if (!$urls->{$dt->year}{$dt->month_name}{$dt->day}{url}) {
      $urls->{$dt->year}{$dt->month_name} = get_date_url($urls->{$dt->year}{$dt->month_name}{url}, 'day');
      if (!$urls->{$dt->year}{$dt->month_name}{$dt->day}{url}) {
        warn "No day data on " . $dt->day_name . " " . $dt->ymd;
        return;
      }
    }
    # get urls of orgs
    if (!$urls->{$dt->year}{$dt->month_name}{$dt->day}{$org}{url}) {
      $urls->{$dt->year}{$dt->month_name}{$dt->day} = get_date_url($urls->{$dt->year}{$dt->month_name}{$dt->day}{url}, 'org');
      if (scalar keys %{$urls->{$dt->year}{$dt->month_name}{$dt->day}} <= 1) {
        warn "No rules on " . $dt->ymd("/");
        return;
      }
    }
    my $orgmatch;
    my $orgdat = $urls->{$dt->year}{$dt->month_name}{$dt->day};
    my @names = grep { $_ =~ qr/$regex/ } keys %{$orgdat};
    foreach my $name (@names) {
      push @$orgmatch, $orgdat->{$name};
    }
    print "urls: " . Dumper($urls) if ($DEBUG >= 5);
    print "get_org_url orgdat: " . Dumper($orgdat) .
          "\n\torgmatch: " . Dumper($orgmatch) .
          "\n\tnames: " . Dumper(@names) . "\n" if ($DEBUG >= 3);
    return $orgmatch // '';
  };
}
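# get_date_url: scrapes one level of the FR browse tree. $append is the
# query string for the level to expand (undef for the top/year level) and
# $type selects which div class to look for. Returns a hashref keyed by
# year/month name/day number/agency name, each value holding that entry's
# URL, plus a count of entries found.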
sub get_date_url {
  my ($append, $type) = @_;
  print "get_date_url params: " . Dumper(@_) if ($DEBUG >= 2);
  my $scr_process = scraper {
    process '//a[@onclick]', 'onclick[]' => sub {
      my $link = shift;
      my ($end_url) = $link->attr('onclick') =~ /(\?[\w\=\&\%\+\-]+)/;
      my $date = $link->as_text;
      $date =~ s/^\s+//;
      $date =~ s/\s+$//;
      return $date, $end_url;
    };
    result 'onclick';
  };
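  # The browse links on this page navigate via JavaScript rather than
  # plain hrefs, so the target query string is pulled out of each link's
  # onclick attribute by the callback above.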
  my $scraper = {
    year => scraper {
      process '//div[@class="level1 browse-level"]', 'data[]' => $scr_process;
    },
    month => scraper {
      process '//div[@class="level2 browse-level"]', 'data[]' => $scr_process;
    },
    day => scraper {
      process '//div[@class="level3 browse-level"]', 'data[]' => $scr_process;
    },
    org => scraper {
      process '//div[@class="level4 browse-leaf-level "]', 'data[]' => $scr_process;
    },
  };
  my $uri = $yml->{scraper}{base_url} . ($append // $yml->{scraper}{base_path});
  my $content = process_uri($uri);
  my $res;
  foreach my $data (@{$scraper->{$type}->scrape($content)->{data}}) {
    if ($type eq 'day') {
      my ($daynum) = $data->[0] =~ /(\d+)/;
      $res->{$daynum} = { url => $data->[1] };
    } else {
      $res->{$data->[0]} = { url => $data->[1] };
    }
  }
  $res->{count} = scalar keys %{$res};
  $res->{url} = $append if ($append);
  print "get_date_url Result: " . Dumper($res) if ($DEBUG >= 5);
  return $res;
}
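# get_web_dat: fetches the listing page for one agency on one day,
# follows each entry's "htm" link, and splits the plain-text document
# into parts keyed by their uppercase headings (AGENCY, ACTION, SUMMARY,
# etc.), plus the page number and a description. Returns ($kept, $rejected)
# when a scraper.sections filter is configured, or all entries otherwise.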
sub get_web_dat { # return $data, $error
  my ($append) = @_;
  print "get_web_dat params: " . Dumper(@_) if ($DEBUG >= 2);
  my $fr = scraper {
    process '//td[@class="browse-download-links"]', 'info[]' => scraper {
      process '//a[@href =~ /htm/]', 'text' => sub {
        my $link = shift;
        my $response = $browser->get($link->attr('href'));
        warn "Unable to get file from: " . $link->attr('href') . "\nResponse status: " . $response->status_line
          unless $response->is_success;
        my $text = $response->content;
        return { 'err' => "Word found" } if ($yml->{scraper}{discard} and grep { $text =~ qr/$_/ } @{$yml->{scraper}{discard}});
        my @split = map { s/^\[|\]$//m; $_ } split /\]\n[\S ]*\[|\n{2,}/, $text;
        my %match;
        for my $i (0 .. $#split) {
          my ($sec, $desc);
          ($match{PAGE}) = $split[$i] =~ /Pages? ([\d-]+)/ if (!defined($match{PAGE}));
          ($sec, $desc) = $split[$i] =~ m/^([A-Z]+):\s*(.*)/msg;
          $match{$sec} = $desc if ($sec and $desc);
          if (($match{ACTION} or $match{AGENCY}) and !$match{DESC}) {
            $match{DESC} = $split[$i - 1];
          }
        }
        s/ *\n+/ /g for values %match;
        return { 'link' => $link->attr('href'), 'part' => {%match} };
      };
      process '//a[@href =~ /pdf/]', 'pdf' => '@href';
    };
  };
  my $uri = $yml->{scraper}{base_url} . $append;
  my $content = process_uri($uri);
  my $sections = $fr->scrape($content);
  print "get_web_dat sections: " . Dumper($sections) if ($DEBUG >= 4);
  return $sections->{info} if (!$yml->{scraper}{sections});
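  # scraper.sections (from the config) acts as a whitelist: entries whose
  # parsed text contains at least one of the listed parts are kept; the
  # rest are returned separately so the caller can record them as errors.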
  my ($ret, $err);
  foreach my $i (0 .. $#{$sections->{info}}) {
    if (grep { defined $sections->{info}[$i]{text}{part}{$_} } @{$yml->{scraper}{sections}}) {
      push @$ret, $sections->{info}[$i];
    } else {
      push @$err, $sections->{info}[$i];
    }
  }
  return $ret, $err;
}
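# process_uri: GET a URI and return the body, dying on any HTTP failure.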
sub process_uri {
  my ($uri) = @_;
  print "url: $uri\n" if ($DEBUG >= 1);
  my $response = $browser->get($uri);
  die "Failed to get url: " . $uri . "\nResponse status: " . $response->status_line
    unless ($response->is_success);
  return $response->content;
}

=head1 SYNOPSIS
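
For example, assuming the script is saved as F<fr.pl> (the file name and
dates here are illustrative):

    perl fr.pl -start "2012-11-01" -end "2012-11-03" -debug 1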

Options:

=over 14

=item B<-s, -start>

Date to start the parser on. Spaces and other special characters need quoting.

=item B<-e, -end>

Date to end the parser on. (Optional) Without this option the parser will run to the current date.

=item B<-f, -force>

Forces data for each agency currently in the yaml file to be deleted from the database for each day selected, and new data to be fetched and inserted.
Without this option, if a record exists for an agency in the database, no data will be retrieved for that agency on that day.

=item B<-y, -yaml>

Specify a yaml file to use. (Default: config.yml)

=item B<-d, -dump>

Dump all scraped data to the screen instead of writing it to the database.

=item B<-debug>

Set the debug output level; higher levels print progressively more data (0 - none, 5 - max).

=item B<-h, -help>

Print these options.

=item B<-m, -man>

View all documentation.

=back

=head1 DESCRIPTION

This script scrapes the Federal Register site: it walks the site's browse
hierarchy by year, month, day, and agency, parses each document's text
into its labeled parts, and stores the results in the database described
by the yaml config.

=cut