Last active
January 2, 2016 08:28
-
-
Save davidfauth/8276345 to your computer and use it in GitHub Desktop.
fbo contracts analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * FBO_Data
 *
 * Script parameters. Each can be overridden on the command line with
 * -param NAME=value.
 *
 *   INPUT_PATH       path to the "active" CSV extract; not referenced by
 *                    any statement visible in this script.
 *   INPUT_NEW_PATH   full path to the 12_13 archive file; not referenced by
 *                    any statement visible in this script.
 *   INPUT_DATA_PATH  directory holding the tab-delimited archive files.
 *   OUTPUT_PATH      root directory under which results are stored.
 */
%default INPUT_PATH '/Users/davidfauth/fbo_data/fbo_data_active.csv'
%default INPUT_NEW_PATH '/Users/davidfauth/fbo_data/fbo_data_pig/fbo_data_archive_12_13_tab.txt'
%default INPUT_DATA_PATH '/Users/davidfauth/fbo_data/fbo_data_pig'
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'

/**
 * User-Defined Functions (UDFs)
 *
 * The Python UDF module is exposed under the nltk_udfs namespace.
 */
REGISTER '/Users/davidfauth/mortarProjects/fbo_data/udfs/python/fbo_data.py' USING streaming_python AS nltk_udfs;
-- Load the 12_13 archive file. It is tab-delimited and every column is read
-- as chararray; numeric interpretation (e.g. of contractAwardAmount) is left
-- to later statements.
fy12_13_data = LOAD '$INPUT_DATA_PATH/fbo_data_archive_12_13_tab.txt'
    USING PigStorage('\t')
    AS (
        postedDate:chararray,
        classCode:chararray,
        naicsCode:chararray,
        agencyName:chararray,
        title:chararray,
        solicitationNumber:chararray,
        responseDeadline:chararray,
        pocEmail:chararray,
        setAside:chararray,
        popAddress:chararray,
        popCity:chararray,
        popZip:chararray,
        popCountry:chararray,
        placeOfPerformanceText:chararray,
        noticeType:chararray,
        contractAwardNumber:chararray,
        contractAwardAmount:chararray,
        contractAwardDate:chararray,
        awardee:chararray,
        contractorAwardedDuns:chararray,
        noticeID:chararray
    );
/* get Awards */
-- From each input relation keep only rows whose noticeType is exactly
-- 'Award Notice' and whose contractAwardAmount is present.
--
-- NOTE(review): active_data, fy00_04_data, fy05_07_data, fy08_09_data and
-- fy10_11_data are not defined by any LOAD visible in this script; only
-- fy12_13_data is. They must be loaded before this point or Pig will reject
-- the undefined aliases. The UNION below also presumes all six relations
-- share the same column layout -- confirm when adding the missing LOADs.
activeHasAward = FILTER active_data BY (noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL);
fy0004HasAward = FILTER fy00_04_data BY (noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL);
fy0507HasAward = FILTER fy05_07_data BY (noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL);
fy0809HasAward = FILTER fy08_09_data BY (noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL);
fy1011HasAward = FILTER fy10_11_data BY (noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL);
fy1213HasAward = FILTER fy12_13_data BY (noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL);

/* combine the award rows from every input into one relation */
allAwardData = UNION activeHasAward, fy0004HasAward, fy0507HasAward, fy0809HasAward, fy1011HasAward, fy1213HasAward;
/* group Data */
-- Project the columns needed for aggregation. The raw amount text is passed
-- through the cleanAmount UDF and cast to double: single-precision float keeps
-- only about 7 significant digits, which drops low-order digits from large
-- dollar amounts before they are summed.
groupAllAwardDataAgency = FOREACH allAwardData GENERATE
    agencyName,
    (double) nltk_udfs.cleanAmount(contractAwardAmount) AS fboAmount;

-- Same projection with the awardee retained. No later statement in this
-- script consumes this relation.
groupAllAwardDataAgencyAwardee = FOREACH allAwardData GENERATE
    agencyName,
    awardee,
    (double) nltk_udfs.cleanAmount(contractAwardAmount) AS fboAmount;

-- One group per agency, then a row count and an amount total for each group.
rawAwardsByAgency = GROUP groupAllAwardDataAgency BY (agencyName);
agencyAwardSummary = FOREACH rawAwardsByAgency GENERATE
    FLATTEN(group) AS agencyName,
    COUNT(groupAllAwardDataAgency) AS countAwardsByAgency,
    SUM(groupAllAwardDataAgency.fboAmount) AS sumAwardAmt;

-- Agencies with the most award rows come first.
orderedAgencyAwardSummary = ORDER agencyAwardSummary BY countAwardsByAgency DESC;
-- Remove any previous result for this job. Only the directory this script
-- writes to is deleted; removing the whole output root would also wipe
-- unrelated results stored beside it.
rmf $OUTPUT_PATH/agencyAwardDetails;

-- Store the ordered per-agency summary as pipe-delimited text.
STORE orderedAgencyAwardSummary INTO '$OUTPUT_PATH/agencyAwardDetails' USING PigStorage('|');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment