Last active
August 29, 2015 14:17
-
-
Save mwinkle/20dcc092588427a75eff to your computer and use it in GitHub Desktop.
python Processing Json docs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is a Python streaming program designed to be called from a Hive query.
# It processes a complex JSON document and returns the right set of columns and rows.
# A second gist will contain the Hive query that can be used to process this.
import json
import sys
def process_json_document(json_doc):
    """Print one tab-delimited row per activity entry of a JSON document.

    Every row has five columns: id, lessonbranch, elapsedseconds, activity,
    datetime. The document's ``_id`` is repeated on each row, followed by
    the ``LessonBranch``, ``ElapsedSeconds``, ``Activity`` and ``DateTime``
    values of one entry of ``ActivityCountedCollection``.

    Args:
        json_doc: The JSON document as a string (which is why ``json.loads``,
            with an "s", is used rather than ``json.load``).

    Raises:
        ValueError: If ``json_doc`` is not valid JSON (from ``json.loads``).
        KeyError: If ``_id``, ``ActivityCountedCollection`` or one of the
            per-entry fields is missing.
    """
    json_record = json.loads(json_doc)
    # Named doc_id rather than id so the builtin id() is not shadowed.
    doc_id = json_record['_id']
    for rec in json_record['ActivityCountedCollection']:
        interesting_values = [
            doc_id,
            rec['LessonBranch'],
            rec['ElapsedSeconds'],
            rec['Activity'],
            rec['DateTime'],
        ]
        # Hive treats tab-delimited output as individual columns.
        # map(str, ...) is required because join() only accepts strings.
        # print(...) with a single parenthesized argument behaves the same
        # on Python 2 and Python 3, unlike the bare print statement.
        print('\t'.join(map(str, interesting_values)))
# Each line read from stdin is treated as one complete JSON document.
for line in sys.stdin:
    line = line.strip()
    if not line:
        # A blank line is not a document; json.loads('') would raise and
        # abort the whole stream, so skip it.
        continue
    process_json_document(line)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment