#!/usr/bin/perl
use strict;
use warnings;
#use Carp::Always;
use Data::Dumper;
use Try::Tiny;
use DateTime;
use DateTime::Format::Natural;
use DateTime::Format::DBI;
use Getopt::Long;
use Pod::Usage;
use LWP;
use Web::Scraper;
use URI::Escape;
use Config::Any;
use Dir::Self;
use lib __DIR__ . "/lib";
use FR::Schema;
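# FR::Schema (in ./lib, not included in this gist) is presumably a
# DBIx::Class schema; the Text resultset used below carries org, dates,
# and batch relations.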
my $opts;
GetOptions( 'start|s=s' => \$opts->{start},
            'end|e=s'   => \$opts->{end},
            'force|f'   => \$opts->{force},
            'yaml|y=s'  => \$opts->{yml},
            'dump|d'    => \$opts->{dump},
            'debug=i'   => \$opts->{debug},
            'help|h'    => \$opts->{help},
            'man|m'     => \$opts->{man}
) or pod2usage( { -verbose => 0, -output => \*STDERR,
                  -msg => "$0 no parameter found.\nUse -help for more options.\n"
} );

if ($opts->{man}) {
  pod2usage( -verbose => 2 );
} elsif ($opts->{help} or !$opts->{start}) {
  pod2usage( -verbose => 0, -output => \*STDERR,
             -msg => "$0 [options]" );
}
##############
# INIT stuff #
##############
# TODO: allow alternative yaml file and document $opts
my $config = Config::Any->load_files( {
  files   => [ ($opts->{yml} // 'config.yml') ],
  use_ext => 1
} );
my $yml   = $config->[0]{$opts->{yml} // 'config.yml'};
my $orgs  = $yml->{orgs};
my $creds = $yml->{creds} or die "No sql information\n";
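# Expected config.yml layout, inferred from the keys read below
# (values are illustrative, not authoritative):
#
#   creds:
#     type: mysql          # DBI driver name
#     name: fr             # database name
#     host: localhost      # optional
#     user: someuser
#     pass: somepass
#   orgs:
#     FCC: Federal Communications Commission
#   scraper:
#     base_url: http://...
#     base_path: ?...
#     discard: [ ... ]     # optional: regexes; matching pages are dropped
#     sections: [ ... ]    # optional: keep entries having these parts
#   DEBUG: 0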
my @connect = ( 'dbi:' . $creds->{type} .
                ':database=' . $creds->{name} .
                ($creds->{host} ? ';host=' . $creds->{host} : ""),
                $creds->{user} ? $creds->{user} : "",
                $creds->{pass} ? $creds->{pass} : ""
);
my $schema = FR::Schema->connect(@connect);
my $text = $schema->resultset('Text');
my $dtf = $schema->storage->datetime_parser;
my $browser = LWP::UserAgent->new();
my $dtparse = DateTime::Format::Natural->new;
my $DEBUG = $opts->{debug} || $yml->{DEBUG} || "0";
my $edt;
if ($opts->{end}) {
  $edt = $dtparse->parse_datetime($opts->{end});
  die "end date parser error: " . $dtparse->error . "\n" unless $dtparse->success;
} else {
  $edt = DateTime->now()->set(hour => 0, minute => 0, second => 0);
}
my $sdt = $dtparse->parse_datetime($opts->{start});
die "start date parser error: " . $dtparse->error . "\n" unless $dtparse->success;
die "start date must occur prior to end date\n" unless $sdt <= $edt;
my $curdt = $sdt->clone->subtract(days => 1)->set(hour => 0, minute => 0, second => 0);
my $get_org_url = get_org_url();
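# $curdt is backed up one day so the add(days => 1) at the top of the
# main loop lands on the requested start date itself. get_org_url()
# returns a closure that caches the scraped year/month/day URL tree
# across iterations.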
##############
# start work #
##############
my @errs;
while ($curdt < $edt) {
  $curdt->add(days => 1);
  print $curdt->ymd("/") . ":\t" . $curdt->day_name . " " . $curdt->month_name . " " . $curdt->day . "\n";
  foreach my $org (keys %$orgs) {
    my $hist_rs = $text->search( {
      'dates.pub_date' => {
        '-between' => [
          $dtf->format_datetime($curdt->clone->subtract(days => 1)),
          $dtf->format_datetime($curdt)
        ]
      },
      'org.symbol' => $org
    }, {
      join => [ qw/dates org batch/ ]
    } );
    print "ORG: " . $orgs->{$org} . ": " . $hist_rs->count . "\n" if ($DEBUG >= 1);
    my $last_run = 0;
    if ($hist_rs->count > 0) {
      if ($opts->{force}) {
        # Read the previous batch number before deleting; the rows being
        # deleted are what carry it.
        $last_run = $hist_rs->get_column('batch.run')->max() // 0;
        $hist_rs->delete;
      } elsif ($opts->{dump}) {
        print "Have records, -dump so processing anyway\n" if ($DEBUG >= 1);
      } else {
        print "Skipped $org because " . $hist_rs->count . " records for this org already exist;" .
              " use -force to process current FR data for this org\n" if ($DEBUG >= 2);
        next;
      }
    }
    my $urls = $get_org_url->($curdt, $orgs->{$org});
    print "Main urls from get_org_url: " . Dumper($urls) if ($DEBUG >= 3);
    next if (ref($urls) ne 'ARRAY' or scalar(@$urls) == 0);
    my $res;
    foreach my $url (@$urls) {
      my ($ret, $err) = get_web_dat($url->{url});
      push @$res, $ret if ($ret);
      push @errs, { "page err" => $err } if ($err);
    }
    my $dtnow = DateTime->now();
    unless ($opts->{dump}) {
      foreach my $url (@$res) {
        foreach my $pagetext (@$url) {
          my $part = $pagetext->{text}{part};
          try {
            $schema->txn_do( sub {
              $text->create( {
                stamp       => $dtnow,
                description => $part->{DESC}    // "",
                summary     => $part->{SUMMARY} // "",
                address     => $part->{ADDRESS} // "",
                action      => $part->{ACTION}  // "",
                page        => $part->{PAGE}    // "",
                text_uri    => $pagetext->{text}{link},
                pdf_uri     => $pagetext->{pdf},
                org => {
                  symbol => $org,
                  web_ag => $part->{AGENCY} // "",
                  agency => $orgs->{$org}
                },
                dates => {
                  dates    => $part->{DATES} // "",
                  pub_date => $curdt
                },
                batch => {
                  run => ($last_run + 1)
                }
              }, {
                key => 'text_pk'
              } );
            } );
          } catch {
            push @errs, { "db err" => { date => $curdt->ymd("/"), org => $org, page => $pagetext } };
            print "err: " . $_ . "\n";
          };
        }
      }
    } else { # -dump stuff
      print "Results: " . Dumper($res) if ($res);
    }
  }
}
print "Fail: " . Dumper(@errs) if (@errs);
sub get_org_url {
  my $urls;
  return sub {
    my ($dt, $org) = @_;
    if ($dt->day_name eq 'Saturday' or
        $dt->day_name eq 'Sunday' or
        ($urls->{count} // 1) == 0 or
        ($urls->{$dt->year}{count} // 1) == 0 or
        ($urls->{$dt->year}{$dt->month_name}{count} // 1) == 0 or
        ($urls->{$dt->year}{$dt->month_name}{$dt->day}{count} // 1) == 0)
    {
      print "Skipping get_org_url for $org\n" if ($DEBUG >= 2);
      return;
    }
    # Turn the agency name into an anchored regex, with spaces loosened to .+
    (my $regex = $org) =~ s| |.+|g;
    $regex = '^.*' . $regex . '.*$';
    # get urls of years
    if (!$urls->{$dt->year}{url}) {
      $urls = get_date_url(undef, 'year');
      die "No year data on " . $dt->year if (!$urls->{$dt->year}{url});
    }
    # get urls of months
    if (!$urls->{$dt->year}{$dt->month_name}{url}) {
      $urls->{$dt->year} = get_date_url($urls->{$dt->year}{url}, 'month');
      die "No month data on " . $dt->month_name if (!$urls->{$dt->year}{$dt->month_name}{url});
    }
    # get urls of days
    if (!$urls->{$dt->year}{$dt->month_name}{$dt->day}{url}) {
      $urls->{$dt->year}{$dt->month_name} = get_date_url($urls->{$dt->year}{$dt->month_name}{url}, 'day');
      if (!$urls->{$dt->year}{$dt->month_name}{$dt->day}{url}) {
        warn "No day data on " . $dt->day_name . " " . $dt->ymd;
        return;
      }
    }
    # get urls of orgs
    if (!$urls->{$dt->year}{$dt->month_name}{$dt->day}{$org}{url}) {
      $urls->{$dt->year}{$dt->month_name}{$dt->day} = get_date_url($urls->{$dt->year}{$dt->month_name}{$dt->day}{url}, 'org');
      if (scalar keys %{$urls->{$dt->year}{$dt->month_name}{$dt->day}} <= 1) {
        warn "No rules on " . $dt->ymd("/");
        return;
      }
    }
    my $orgmatch;
    my $orgdat = $urls->{$dt->year}{$dt->month_name}{$dt->day};
    my @names = grep { $_ =~ qr/$regex/ } keys %{$orgdat};
    foreach my $name (@names) {
      push @$orgmatch, $orgdat->{$name};
    }
    print "urls: " . Dumper($urls) if ($DEBUG >= 5);
    print "get_org_url orgdat: " . Dumper($orgdat) .
          "\n\torgmatch: " . Dumper($orgmatch) .
          "\n\tnames: " . Dumper(@names) . "\n" if ($DEBUG >= 3);
    return $orgmatch // '';
  };
}
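# get_date_url: scrapes one level of the FR browse tree. $append is the
# query string for the level to expand (undef for the top/year level) and
# $type selects which div class to look for. Returns a hashref keyed by
# year/month name/day number/agency name, each value holding that entry's
# URL, plus a count of entries found.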
sub get_date_url {
  my ($append, $type) = @_;
  print "get_date_url params: " . Dumper(@_) if ($DEBUG >= 2);
  my $scr_process = scraper {
    process '//a[@onclick]', 'onclick[]' => sub {
      my $link = shift;
      my ($end_url) = $link->attr('onclick') =~ /(\?[\w\=\&\%\+\-]+)/;
      my $date = $link->as_text;
      $date =~ s/^\s+//;
      $date =~ s/\s+$//;
      return $date, $end_url;
    };
    result 'onclick';
  };
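  # The browse links on this page navigate via JavaScript rather than
  # plain hrefs, so the target query string is pulled out of each link's
  # onclick attribute by the callback above.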
  my $scraper = {
    year => scraper {
      process '//div[@class="level1 browse-level"]', 'data[]' => $scr_process;
    },
    month => scraper {
      process '//div[@class="level2 browse-level"]', 'data[]' => $scr_process;
    },
    day => scraper {
      process '//div[@class="level3 browse-level"]', 'data[]' => $scr_process;
    },
    org => scraper {
      process '//div[@class="level4 browse-leaf-level "]', 'data[]' => $scr_process;
    },
  };
  my $uri = $yml->{scraper}{base_url} . ($append // $yml->{scraper}{base_path});
  my $content = process_uri($uri);
  my $res;
  foreach my $data (@{$scraper->{$type}->scrape($content)->{data}}) {
    if ($type eq 'day') {
      my ($daynum) = $data->[0] =~ /(\d+)/;
      $res->{$daynum} = { url => $data->[1] };
    } else {
      $res->{$data->[0]} = { url => $data->[1] };
    }
  }
  $res->{count} = scalar keys %{$res};
  $res->{url} = $append if ($append);
  print "get_date_url Result: " . Dumper($res) if ($DEBUG >= 5);
  return $res;
}
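# get_web_dat: fetches the listing page for one agency on one day,
# follows each entry's "htm" link, and splits the plain-text document
# into parts keyed by their uppercase headings (AGENCY, ACTION, SUMMARY,
# etc.), plus the page number and a description. Returns ($kept, $rejected)
# when a scraper.sections filter is configured, or all entries otherwise.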
sub get_web_dat { # return $data, $error
  my ($append) = @_;
  print "get_web_dat params: " . Dumper(@_) if ($DEBUG >= 2);
  my $fr = scraper {
    process '//td[@class="browse-download-links"]', 'info[]' => scraper {
      process '//a[@href =~ /htm/]', 'text' => sub {
        my $link = shift;
        my $response = $browser->get($link->attr('href'));
        warn "Unable to get file from: " . $link->attr('href') . "\nResponse status: " . $response->status_line
          unless $response->is_success;
        my $text = $response->content;
        return { 'err' => "Word found" } if ($yml->{scraper}{discard} and grep { $text =~ qr/$_/ } @{$yml->{scraper}{discard}});
        my @split = map { s/^\[|\]$//m; $_ } split /\]\n[\S ]*\[|\n{2,}/, $text;
        my %match;
        for my $i (0 .. $#split) {
          my ($sec, $desc);
          ($match{PAGE}) = $split[$i] =~ /Pages? ([\d-]+)/ if (!defined($match{PAGE}));
          ($sec, $desc) = $split[$i] =~ m/^([A-Z]+):\s*(.*)/msg;
          $match{$sec} = $desc if ($sec and $desc);
          if (($match{ACTION} or $match{AGENCY}) and !$match{DESC}) {
            $match{DESC} = $split[$i - 1];
          }
        }
        s/ *\n+/ /g for values %match;
        return { 'link' => $link->attr('href'), 'part' => {%match} };
      };
      process '//a[@href =~ /pdf/]', 'pdf' => '@href';
    };
  };
  my $uri = $yml->{scraper}{base_url} . $append;
  my $content = process_uri($uri);
  my $sections = $fr->scrape($content);
  print "get_web_dat sections: " . Dumper($sections) if ($DEBUG >= 4);
  return $sections->{info} if (!$yml->{scraper}{sections});
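  # scraper.sections (from the config) acts as a whitelist: entries whose
  # parsed text contains at least one of the listed parts are kept; the
  # rest are returned separately so the caller can record them as errors.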
  my ($ret, $err);
  foreach my $i (0 .. $#{$sections->{info}}) {
    if (grep { defined $sections->{info}[$i]{text}{part}{$_} } @{$yml->{scraper}{sections}}) {
      push @$ret, $sections->{info}[$i];
    } else {
      push @$err, $sections->{info}[$i];
    }
  }
  return $ret, $err;
}
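# process_uri: GET a URI and return the body, dying on any HTTP failure.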
sub process_uri {
  my ($uri) = @_;
  print "url: $uri\n" if ($DEBUG >= 1);
  my $response = $browser->get($uri);
  die "Failed to get url: " . $uri . "\nResponse status: " . $response->status_line
    unless ($response->is_success);
  return $response->content;
}

=head1 SYNOPSIS
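
For example, assuming the script is saved as F<fr.pl> (the file name and
dates here are illustrative):

    perl fr.pl -start "2012-11-01" -end "2012-11-03" -debug 1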

Options:

=over 14

=item B<-s, -start>

Date to start the parser on. Spaces and other special characters need quoting.

=item B<-e, -end>

Date to end the parser on. (Optional) Without this option the parser will run to the current date.

=item B<-f, -force>

Forces data for each agency currently in the yaml file to be deleted from the database for each day selected, and new data to be fetched and inserted.
Without this option, if a record exists for an agency in the database, no data will be retrieved for that agency on that day.

=item B<-y, -yaml>

Specify a yaml file to use. (Default: config.yml)

=item B<-d, -dump>

Dump all scraped data to the screen instead of writing it to the database.

=item B<-debug>

Set the debug output level; higher levels print progressively more data (0 - none, 5 - max).

=item B<-h, -help>

Print these options.

=item B<-m, -man>

View all documentation.

=back

=head1 DESCRIPTION

This script scrapes the Federal Register site: it walks the site's browse
hierarchy by year, month, day, and agency, parses each document's text
into its labeled parts, and stores the results in the database described
by the yaml config.

=cut