Created
April 13, 2012 07:27
-
-
Save Ovid/2374806 to your computer and use it in GitHub Desktop.
Federal Register XML Documents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fetch Federal Register daily XML documents
# I decided not to include this in my book.
#
# This script is for daily updates and will be slow. Go to
# http://www.gpo.gov/fdsys/bulkdata/FR to download a year's
# worth of documents at a time in .zip format. This script
# can then keep you up to date.
use strict;
use warnings;
use utf8::all;
use LWP::Simple;
use DateTime;
use DateTime::Format::Strptime;

# Seconds to pause after each download attempt.
my $sleep = 3;

# Directory (relative to the current working directory) that receives the
# downloaded XML files; created on first run.
my $dir = 'federal_register';

# Both %s placeholders are filled with the same date string.
my $url_format = 'http://www.gpo.gov/fdsys/pkg/FR-%s/xml/FR-%s.xml';

if ( not -d $dir ) {
    mkdir $dir or die "Cannot mkdir '$dir': $!";
}
chdir $dir or die "Cannot chdir($dir): $!";

# Attaching a '%F' (YYYY-MM-DD) formatter makes $start_date stringify in the
# form needed for both the URL and the local filename below.
my $formatter  = DateTime::Format::Strptime->new( pattern => '%F' );
my $start_date = DateTime->new(
    year      => 2000,
    day       => 18,
    month     => 1,
    formatter => $formatter,
);
my $end_date = DateTime->today;

# Walk one day at a time from the start date up to (but not including) today,
# downloading every document that is not already present on disk.
while ( $start_date < $end_date ) {
    my $url      = sprintf $url_format => $start_date, $start_date;
    my $filename = "$start_date.xml";

    # Advance now so that every exit path below (including 'next') moves on
    # to the following day.
    $start_date->add( days => 1 );

    next if -f $filename;    # we already have it

    print "Fetching $url\n";
    my $xml = get($url);
    if ( defined $xml and length $xml ) {

        # Write to a temporary name and rename only after a successful close.
        # Otherwise a failed or interrupted write would leave a truncated
        # "$filename" behind, and the -f check above would skip it forever.
        my $partial = "$filename.part";
        open my $fh, '>', $partial
          or die "Cannot open $partial for writing: $!";
        print {$fh} $xml
          or die "Cannot write to $partial: $!";
        close $fh
          or die "Cannot close $partial: $!";
        rename $partial, $filename
          or die "Cannot rename $partial to $filename: $!";
    }
    else {
        warn "Failed to get $url";
    }
    sleep $sleep;    # be nice to the US government
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment