Last active
December 26, 2015 11:49
-
-
Save davidfauth/7146577 to your computer and use it in GitHub Desktop.
Mortar Pigscript outputting to Elasticsearch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- 'Document' is the delimiter | |
-- 'event, gathering' is the tag list | |
-- Default location of the tab-separated bigram report. Override it with
-- `-param OUTPUT_PATH=...` on the pig command line.
%default OUTPUT_PATH '/Users/davidfauth/MortarBillsData'

-- Streaming-Python UDFs, made available under the nltk_udfs namespace.
REGISTER '/Users/davidfauth/mortarProjects/billsProject/udfs/python/billsProject.py' USING streaming_python AS nltk_udfs;

-- Elasticsearch-Hadoop connector jar.
REGISTER '/Users/davidfauth/Downloads/elasticsearch-hadoop-1.3.0.M1.jar';

-- Alias for the Elasticsearch store function, configured for the
-- govtrack/bills resource.
-- NOTE(review): the Elasticsearch STORE in this script names the class
-- directly instead of using this alias -- confirm whether the alias is needed.
DEFINE ESStorage org.elasticsearch.hadoop.pig.ESStorage('es.resource=govtrack/bills');
-- Load the bill JSON documents, declaring only the fields this script reads.
-- The schema is kept as one single-line string literal: a line break inside
-- the quotes would end up inside the schema text handed to the loader.
-- `subjects` carries no declared type in the loader schema.
bills = LOAD '/Users/davidfauth/MortarData/'
    USING org.apache.pig.piggybank.storage.JsonLoader(
        'bill_id:chararray, congress:chararray, official_title:chararray, updated_at:chararray, subjects_top_term:chararray, summary:map[], sponsor:map[], subjects');
-- Per-bill record with the nested sponsor/summary maps flattened into scalar
-- columns and the subject list typed as a bag of chararray tuples.
billDetails = FOREACH bills GENERATE
    bill_id,
    congress,
    official_title,
    updated_at,
    subjects_top_term,
    sponsor#'name'  AS sponsorName:chararray,    -- map lookup: sponsor name
    sponsor#'state' AS sponsorState:chararray,   -- map lookup: sponsor state
    subjects        AS subjectList: {t: (subjects: chararray)},
    summary#'text'  AS billText:chararray;       -- map lookup: summary text
-- Flat per-bill record (same columns as billDetails, minus the subject bag).
-- This is the relation that gets stored into Elasticsearch.
billSearch = FOREACH bills GENERATE
    bill_id,
    congress,
    official_title,
    updated_at,
    subjects_top_term,
    sponsor#'name'  AS sponsorName:chararray,    -- map lookup: sponsor name
    sponsor#'state' AS sponsorState:chararray,   -- map lookup: sponsor state
    summary#'text'  AS billText:chararray;       -- map lookup: summary text
-- Group the bills by their top subject term and pass each group's official
-- titles to the nltk_udfs.top_5_bigrams Python UDF. sample_size is the number
-- of bills in the group.
-- NOTE(review): the relation name still says "place", but the grouping key is
-- subjects_top_term; the name is kept because later statements refer to it.
bigrams_by_place = FOREACH (GROUP billDetails BY subjects_top_term) GENERATE
    group AS subjects_top_term:chararray,
    nltk_udfs.top_5_bigrams(billDetails.official_title),
    COUNT(billDetails) AS sample_size;
-- Keep the 100 groups with the largest sample_size.
top_100_places = LIMIT (ORDER bigrams_by_place BY sample_size DESC) 100;

-- Store the flat bill records into the govtrack/bills Elasticsearch resource.
STORE billSearch INTO 'govtrack/bills' USING org.elasticsearch.hadoop.pig.ESStorage();

-- Remove any previous report, then write the new one to the SAME location.
-- Both statements use $OUTPUT_PATH so that overriding the parameter cannot
-- make the cleanup and the write target different directories.
rmf $OUTPUT_PATH;
STORE top_100_places INTO '$OUTPUT_PATH' USING PigStorage('\t');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment