Created
February 22, 2013 17:02
-
-
Save nanis/5014925 to your computer and use it in GitHub Desktop.
Perl source code to clean up U.S. Federal Government Payroll data from <http://www.census.gov//govs/apes/>. Currently, it ignores the Department of Homeland Security. Dollar amounts are adjusted to 2011 dollars using the CPI. Other indexes maybe more appropriate.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# Author: A. Sinan Unur | |
# Copyright (C) 2013 | |
# License: This is free software. You can use it under the terms | |
# of Artistic License 2.0 <http://opensource.org/licenses/Artistic-2.0> | |
# In particular: | |
# (14) Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT | |
# HOLDER AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED | |
# WARRANTIES. THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A | |
# PARTICULAR PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT | |
# PERMITTED BY YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT | |
# HOLDER OR CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, | |
# INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE | |
# USE OF THE PACKAGE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
use autodie; | |
use strict; use warnings; | |
use feature 'say'; | |
use File::Slurp qw( read_dir ); | |
my @files = sort { | |
substr($a, 0, 4) cmp substr($b, 0, 4) | |
} grep /[.]txt\z/i, read_dir '.'; | |
# Fix inconsistencies across years | |
my %fixes = ( | |
'Airports' => 'Air Transportation', # 2002 and 2003 | |
'Corrections' => 'Correction', # 1997 and 1998 | |
'Housing and Community Development (' => | |
'Housing and Community Development', # 2002 | |
'Housing & Community Development' => | |
'Housing and Community Development', # 1997 - 2001 | |
'Local Libraries' => 'Libraries', # 2002 and 2003 | |
'National Defense and International' => | |
'Nat Defense/International Relations', # 2002 and 2003 | |
'Nat Defense/International Relat' => | |
'Nat Defense/International Relations', # 1997 - 2001 | |
'Other Government Adminstration' => | |
'Other Government Administration', # 1997 and 1998 | |
'Police Protection - Officers' => | |
'Police', # 2002 and 2003 | |
'Postal Service (Federal)' => | |
'Postal Service', # 2002 and 2003 | |
'Social Insurance Administration (St' => | |
'Social Insurance Administration', # 2002 | |
'Space Research and Technology' => | |
'Space Research & Technology', # 2003 | |
'Space Research and Technology (Fede' => | |
'Space Research & Technology', # 2002 | |
'Streets and Highways' => 'Highways', # 2002 and 2003 | |
'TOTAL-ALL FUNCTIONS' => 'TOTAL - ALL FUNCTIONS', # 1997 - 2000 | |
'Water Transport and Terminals' => | |
'Water Transport & Terminals', # 2002 and 2003 | |
'Welfare' => 'Public Welfare', # 2002 and 2003 | |
); | |
# ftp://ftp.bls.gov/pub/special.requests/cpi/cpiai.txt | |
my %cpi = ( | |
1997 => 160.5, | |
1998 => 163, | |
1999 => 166.6, | |
2000 => 172.2, | |
2001 => 177.1, | |
2002 => 179.9, | |
2003 => 184, | |
2004 => 188.9, | |
2005 => 195.3, | |
2006 => 201.6, | |
2007 => 207.342, | |
2008 => 215.303, | |
2009 => 214.537, | |
2010 => 218.056, | |
2011 => 224.939, | |
); | |
# 2011 dollars | |
my $base = $cpi{2011}; | |
$_ = $base/$_ for values %cpi; | |
my $fixer = sub { | |
my ($keys, $fixes) = @_; | |
return sub { | |
my $str = shift; | |
for my $k (@$keys) { | |
$str =~ s/\A\Q$k\E\z/$fixes->{$k}/ | |
and last; | |
} | |
return $str; | |
} | |
}->([sort keys %fixes], \%fixes); | |
my %data; | |
for my $file (@files) { | |
open my $in, '<', $file; | |
my ($year) = ($file =~ /\A([0-9]{4})/); | |
while (my $line = <$in>) { | |
next unless $line =~ /\ATOTAL/ .. $line =~ /\A\s+\z/; | |
next unless $line =~ /\S/; | |
chomp $line; | |
$line =~ s/[",]//g; | |
unless ( | |
$line =~ m{ | |
\A | |
(?<govfun> [^0-9]+) | |
[0-9]+ \s+ | |
[0-9]+ \s+ | |
(?<payroll> [0-9]+) | |
}x | |
) { | |
warn "Unable to parse:\n$line\n"; | |
} | |
my ($govfun, $payroll) = @+{qw(govfun payroll)}; | |
$govfun =~ s/\s+\z//; | |
$govfun =~ s/\s+/ /; | |
$govfun = $fixer->($govfun); | |
$data{$govfun}{$year} = 0 + sprintf('%.0f', $payroll * $cpi{$year}); | |
} | |
} | |
my @govfun = sort keys %data; | |
say join("\t", '', @govfun); | |
for my $year (1997 .. 2011) { | |
my @row = map $data{$_}{$year}, @govfun; | |
say join ("\t", $year, @row); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment