Created
February 16, 2012 20:00
-
-
Save rjurney/1847430 to your computer and use it in GitHub Desktop.
Returning a bag from a Jython Pig UDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Jars required by piggybank's AvroStorage loader.
REGISTER /me/pig/build/ivy/lib/Pig/avro-1.5.3.jar
REGISTER /me/pig/build/ivy/lib/Pig/json-simple-1.1.jar
REGISTER /me/pig/contrib/piggybank/java/piggybank.jar
REGISTER /me/pig/build/ivy/lib/Pig/jackson-core-asl-1.7.3.jar
REGISTER /me/pig/build/ivy/lib/Pig/jackson-mapper-asl-1.7.3.jar

DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();

-- Jython UDFs from udfs.py are exposed under the myfuncs namespace.
REGISTER 'udfs.py' USING jython AS myfuncs;

-- Remove output left over from a previous run so STORE does not fail.
rmf /tmp/jython_test.txt

-- Load a small sample and keep only emails that have both a sender and a recipient.
all_emails = LOAD '/me/tmp/again_inbox/part-0-0.avro' USING AvroStorage();
sample_emails = LIMIT all_emails 100;
addressed_emails = FILTER sample_emails BY (from IS NOT NULL) AND (to IS NOT NULL);

-- One row per (from, to) combination, carrying the email subject.
email_pairs = FOREACH addressed_emails GENERATE FLATTEN(from) AS from, FLATTEN(to) AS to, subject;

-- Collect the subjects exchanged between each (from, to) pair into a bag.
pairs_by_from_to = GROUP email_pairs BY (from, to);
subjects_by_from_to = FOREACH pairs_by_from_to GENERATE group, email_pairs.(subject) AS subjects;

-- bag_test returns only a bag of (word, total) tuples, highest total first;
-- the group key itself is not part of the output.
to_from_word_counts = FOREACH subjects_by_from_to GENERATE myfuncs.bag_test(group, subjects);

STORE to_from_word_counts INTO '/tmp/jython_test.txt';
---------------------- udfs.py ----------------------
@outputSchema("bag:{t2:(word:chararray, total:int)}")
def bag_test(group, subjects):
    """Count whitespace-separated words across a bag of subjects.

    Args:
        group: Grouping key supplied by the calling script; not used here.
        subjects: Iterable of tuples whose first field is a subject string.
            May be None, and individual subjects may be None (presumably how
            Pig nulls reach a Jython UDF -- confirm against the Pig docs).

    Returns:
        A list of (word, total) tuples ordered by total descending; words
        with equal totals are ordered alphabetically so the result is
        deterministic.
    """
    if subjects is None:
        return []

    # A plain dict is used rather than collections.Counter so the UDF does
    # not depend on a newer Python standard library than Jython may provide.
    word_counts = {}
    for subject in subjects:
        # Skip empty tuples and null subjects: calling None.split() would
        # raise AttributeError and abort the UDF call.
        if not subject or subject[0] is None:
            continue
        for word in subject[0].split():
            word_counts[word] = word_counts.get(word, 0) + 1

    # Negating the count sorts highest-first while the word breaks ties.
    return sorted(
        word_counts.items(),
        key=lambda word_count: (-word_count[1], word_count[0]),
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment