Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
FBO contracts analysis
/**
* FBO_Data
*
* Default values for the script parameters. Each one can be overridden
* when the script is launched (for example with -param NAME=value).
*/
-- Not referenced by any statement visible in this script.
%default INPUT_PATH '/Users/davidfauth/fbo_data/fbo_data_active.csv'
-- Not referenced by any statement visible in this script; its default is the
-- same file that the fy12_13_data LOAD builds from INPUT_DATA_PATH.
%default INPUT_NEW_PATH '/Users/davidfauth/fbo_data/fbo_data_pig/fbo_data_archive_12_13_tab.txt'
-- Directory holding the tab-delimited input file read by the LOAD statement.
%default INPUT_DATA_PATH '/Users/davidfauth/fbo_data/fbo_data_pig'
-- Root directory under which the results are stored.
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'
/**
* User-Defined Functions (UDFs)
*
* Registers the Python module fbo_data.py through the streaming_python
* script engine under the namespace nltk_udfs. The only function this
* script calls from that namespace is cleanAmount.
*/
REGISTER '/Users/davidfauth/mortarProjects/fbo_data/udfs/python/fbo_data.py' USING streaming_python AS nltk_udfs;
/*
 * Load the FY12-13 archive as tab-delimited text.
 * Every column is read as chararray; the award amount is converted to a
 * number later, when it is passed through the cleanAmount UDF.
 */
fy12_13_data =
    LOAD '$INPUT_DATA_PATH/fbo_data_archive_12_13_tab.txt'
    USING PigStorage('\t')
    AS (
        -- notice header
        postedDate:chararray,
        classCode:chararray,
        naicsCode:chararray,
        agencyName:chararray,
        title:chararray,
        solicitationNumber:chararray,
        responseDeadline:chararray,
        pocEmail:chararray,
        setAside:chararray,

        -- place of performance
        popAddress:chararray,
        popCity:chararray,
        popZip:chararray,
        popCountry:chararray,
        placeOfPerformanceText:chararray,

        -- notice type and award details
        noticeType:chararray,
        contractAwardNumber:chararray,
        contractAwardAmount:chararray,
        contractAwardDate:chararray,
        awardee:chararray,
        contractorAwardedDuns:chararray,
        noticeID:chararray
    );
/*
 * Get awards: from each input relation keep only the rows whose noticeType
 * is 'Award Notice' and whose contractAwardAmount is not null.
 *
 * NOTE(review): only fy12_13_data is loaded in the part of the script shown
 * here. active_data, fy00_04_data, fy05_07_data, fy08_09_data and
 * fy10_11_data must be defined before these statements -- TODO confirm
 * their LOAD statements exist and share the fy12_13_data schema.
 */
activeHasAward = FILTER active_data
    BY noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL;

fy0004HasAward = FILTER fy00_04_data
    BY noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL;

fy0507HasAward = FILTER fy05_07_data
    BY noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL;

fy0809HasAward = FILTER fy08_09_data
    BY noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL;

fy1011HasAward = FILTER fy10_11_data
    BY noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL;

fy1213HasAward = FILTER fy12_13_data
    BY noticeType == 'Award Notice' AND contractAwardAmount IS NOT NULL;
/*
 * Group data: combine the award notices from every period, then compute
 * per agency the number of award notices and the total award amount, and
 * order the agencies by notice count, largest first.
 */
allAwardData = UNION activeHasAward, fy0004HasAward, fy0507HasAward, fy0809HasAward, fy1011HasAward, fy1213HasAward;

-- Award amounts are cast to double rather than float: single precision
-- keeps only about 7 significant digits, which is too few for large
-- dollar values.
-- NOTE(review): assumes cleanAmount returns a value castable to double
-- (e.g. a numeric string) -- confirm against the UDF's output schema.
groupAllAwardDataAgency = FOREACH allAwardData GENERATE
    agencyName,
    (double) nltk_udfs.cleanAmount(contractAwardAmount) AS fboAmount;

-- Not consumed by any later statement visible in this script; kept so the
-- agency/awardee projection remains available for follow-up analysis.
groupAllAwardDataAgencyAwardee = FOREACH allAwardData GENERATE
    agencyName,
    awardee,
    (double) nltk_udfs.cleanAmount(contractAwardAmount) AS fboAmount;

rawAwardsByAgency = GROUP groupAllAwardDataAgency BY agencyName;

-- COUNT_STAR is used instead of COUNT because COUNT skips tuples whose
-- first field (agencyName here) is null, which would report 0 notices for
-- the null-agency group even though its amounts are still summed.
-- The group key is a single scalar, so it is projected directly and named.
agencyAwardSummary = FOREACH rawAwardsByAgency GENERATE
    group AS agencyName,
    COUNT_STAR(groupAllAwardDataAgency) AS countAwardsByAgency,
    SUM(groupAllAwardDataAgency.fboAmount) AS sumAwardAmt;

orderedAgencyAwardSummary = ORDER agencyAwardSummary BY countAwardsByAgency DESC;
-- Remove only this job's previous output directory. Deleting the whole
-- output root would also wipe any unrelated data stored beneath it, and
-- STORE only requires that its own target directory does not exist.
rmf $OUTPUT_PATH/agencyAwardDetails;
-- Store the per-agency summary as pipe-delimited text.
STORE orderedAgencyAwardSummary INTO '$OUTPUT_PATH/agencyAwardDetails' USING PigStorage('|');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment