# mwinkle/process_json.py

## process_json.py
# This is a Python streaming program designed to be called from a Hive query.
# It processes a complex JSON document and emits the right set of columns and rows.
# A second gist will contain the Hive query that can be used to process this.

import sys
import json


# this prints one row per activity record, with five tab-separated columns:
# id, lessonbranch, elapsedseconds, activity, datetime
def process_json_document(json_doc):
    """Print one tab-separated row per activity record in a JSON document.

    The document must be a JSON object with an ``_id`` field and an
    ``ActivityCountedCollection`` list whose items each carry
    ``LessonBranch``, ``ElapsedSeconds``, ``Activity`` and ``DateTime``.
    Every item produces one output line with the columns
    id, lessonbranch, elapsedseconds, activity, datetime.

    Args:
        json_doc: the JSON document as a string (hence ``json.loads``, with
            an "s", rather than ``json.load``).

    Raises:
        ValueError: if ``json_doc`` is not valid JSON.
        KeyError: if the document, or one of its activity records, lacks one
            of the fields listed above.
    """
    json_record = json.loads(json_doc)
    # named doc_id rather than id so the builtin id() is not shadowed
    doc_id = json_record['_id']
    for rec in json_record['ActivityCountedCollection']:
        interesting_values = [
            doc_id,
            rec['LessonBranch'],
            rec['ElapsedSeconds'],
            rec['Activity'],
            rec['DateTime'],
        ]
        # hive will treat tab delimited output as individual columns;
        # map(str, ...) is required because join only accepts strings.
        # The parenthesised single-argument print behaves the same as the
        # Python 2 print statement and as the Python 3 print function.
        print('\t'.join(map(str, interesting_values)))


# each line read from stdin is treated as one complete JSON document
for line in sys.stdin:
    line = line.strip()
    # a blank line is not a JSON document: json.loads('') would raise
    # ValueError and abort the rest of the stream, so skip it
    if not line:
        continue
    process_json_document(line)
	# This is a Python streaming program designed to be called from a Hive query.
	# It processes a complex JSON document and emits the right set of columns and rows.
	# A second gist will contain the Hive query that can be used to process this.

	import sys
	import json


	# this prints one row per activity record, with five tab-separated columns:
	# id, lessonbranch, elapsedseconds, activity, datetime
	def process_json_document(json_doc):
		"""Print one tab-separated row per activity record in a JSON document.

		The document must be a JSON object with an ``_id`` field and an
		``ActivityCountedCollection`` list whose items each carry
		``LessonBranch``, ``ElapsedSeconds``, ``Activity`` and ``DateTime``.
		Every item produces one output line with the columns
		id, lessonbranch, elapsedseconds, activity, datetime.

		Args:
			json_doc: the JSON document as a string (hence ``json.loads``,
				with an "s", rather than ``json.load``).

		Raises:
			ValueError: if ``json_doc`` is not valid JSON.
			KeyError: if the document, or one of its activity records, lacks
				one of the fields listed above.
		"""
		json_record = json.loads(json_doc)
		# named doc_id rather than id so the builtin id() is not shadowed
		doc_id = json_record['_id']
		for rec in json_record['ActivityCountedCollection']:
			interesting_values = [
				doc_id,
				rec['LessonBranch'],
				rec['ElapsedSeconds'],
				rec['Activity'],
				rec['DateTime'],
			]
			# hive will treat tab delimited output as individual columns;
			# map(str, ...) is required because join only accepts strings.
			# The parenthesised single-argument print behaves the same as the
			# Python 2 print statement and as the Python 3 print function.
			print('\t'.join(map(str, interesting_values)))



	# each line read from stdin is treated as one complete JSON document
	for line in sys.stdin:
		line = line.strip()
		# a blank line is not a JSON document: json.loads('') would raise
		# ValueError and abort the rest of the stream, so skip it
		if not line:
			continue
		process_json_document(line)