bpirkle/druid-to-csv.py

## druid-to-csv.py
#!/usr/bin/python

# This script converts a Druid compactedList response from a native
# query of type scan to csv. It is very specific to the use case
# for which it was created. No guarantees are made regarding its
# general-purpose suitability.
#
# Most functions herein don't do much error handling.
# Instead, exceptions will be thrown, which is messy but effective.
#
# Because of the lazy error handling, the "rows written" output is
# slightly suspect. It is included mostly as a rationality check.
# If the number looks incorrect, dig deeper to see what went wrong.

import json
import csv

# Default input path: the file holding the druid compactedList scan response.
INPUT_FILE = 'query-output-compacted-list.json'
# Default output path for the generated csv file.
OUTPUT_FILE = 'query-output.csv'
# Prefix (double quotes included) that marks the input lines carrying data records.
EVENTS_LABEL = '"events"'

# ===========================================================
# processes one input file line
#
# line: one raw line from the druid compactedList response
# csv_writer: csv writer that receives the data rows
#
# returns: the number of data records written
# ===========================================================
def processOneLine(line, csv_writer):
	"""Write the records found on one input line to the csv writer.

	Lines that do not start (after leading whitespace) with EVENTS_LABEL are
	ignored. For a matching line, everything from the first '[' to the end of
	the line is parsed as JSON and each parsed element is written as one row.

	line: one raw line from the druid compactedList response
	csv_writer: object providing writerows(), e.g. a csv.writer

	returns: the number of data records written (0 for ignored lines)
	raises: ValueError if a matching line has no '[' or is not valid JSON
	"""
	# Guard clause: only the "events" lines carry data records.
	if not line.lstrip().startswith(EVENTS_LABEL):
		return 0

	# In the compacted format a single input line usually holds many records,
	# so one input line typically fans out into several csv rows.
	array_start = line.index('[')
	records = json.loads(line[array_start:])
	record_count = len(records)
	csv_writer.writerows(records)
	return record_count

# ===========================================================
# top-level file processing function
#
# input_file: path and file name of the file containing the druid compactedList response
# output_file: destination path and file name. Overwritten without warning if existent
#
# returns: number of data records written (the header row is not counted)
# ===========================================================
def toCsv(input_file, output_file):
	"""Convert a druid compactedList scan response file into a csv file.

	Writes a fixed header row, then feeds every input line to processOneLine,
	which appends the data rows for the lines that contain records.

	input_file: path of the file containing the druid compactedList response
	output_file: destination path; opened for writing, so any existing file
		is overwritten without warning

	returns: number of data records written (the header row is not counted)
	"""
	# NOTE(review): the header is hard-coded; it presumably mirrors the column
	# order of the scan query that produced the input - confirm when reusing.
	columns = [
		'event_timestamp', 'project', 'event_entity', 'event_type', 'user_text', 'user_type',
		'page_title', 'page_namespace', 'page_type', 'other_tags', 'revisions',
		'text_bytes_diff', 'text_bytes_diff_abs', 'text_bytes_diff_sum', 'text_bytes_diff_abs_sum',
		'revisions_sum', 'events'
	]
	rows_written = 0
	with open(input_file, 'rt', encoding='UTF-8') as druid_response_file:
		# newline='' lets the csv module control line endings itself; without
		# it, platforms that translate '\n' on write emit '\r\r\n' row endings.
		with open(output_file, 'wt', encoding='UTF-8', newline='') as csv_output_file:
			csv_writer = csv.writer(csv_output_file, quoting=csv.QUOTE_ALL)
			csv_writer.writerow(columns)
			for line in druid_response_file:
				rows_written += processOneLine(line, csv_writer)
	# Both files are closed by their with-blocks; no explicit close() needed.
	return rows_written

# ===========================================================
# main script routine
# ===========================================================
def main():
	"""Convert INPUT_FILE to OUTPUT_FILE and print begin/end banners plus the row count."""
	print('------- begin to-csv.py -------')
	total = toCsv(INPUT_FILE, OUTPUT_FILE)
	# sep='' keeps the count glued to the label, exactly like concatenation.
	print('rows written:', total, sep='')
	print('------- end to-csv.py -------')

# ===========================================================
#  script entry point
# ===========================================================
# Run the conversion only when executed as a script, so that importing this
# module (e.g. to reuse toCsv) does not immediately read and write files.
if __name__ == '__main__':
	main()
	#!/usr/bin/python

	# This script converts a Druid compactedList response from a native
	# query of type scan to csv. It is very specific to the use case
	# for which it was created. No guarantees are made regarding its
	# general-purpose suitability.
	#
	# Most functions herein don't do much error handling.
	# Instead, exceptions will be thrown, which is messy but effective.
	#
	# Because of the lazy error handling, the "rows written" output is
	# slightly suspect. It is included mostly as a rationality check.
	# If the number looks incorrect, dig deeper to see what went wrong.

	import json
	import csv

	# Default input path: the file holding the druid compactedList scan response.
	INPUT_FILE = 'query-output-compacted-list.json'
	# Default output path for the generated csv file.
	OUTPUT_FILE = 'query-output.csv'
	# Prefix (double quotes included) that marks the input lines carrying data records.
	EVENTS_LABEL = '"events"'

	# ===========================================================
	# processes one input file line
	#
	# line: one raw line from the druid compactedList response
	# csv_writer: csv writer that receives the data rows
	#
	# returns: the number of data records written
	# ===========================================================
	def processOneLine(line, csv_writer):
		"""Write the records found on one input line to the csv writer.

		Lines that do not start (after leading whitespace) with EVENTS_LABEL are
		ignored. For a matching line, everything from the first '[' to the end of
		the line is parsed as JSON and each parsed element is written as one row.

		line: one raw line from the druid compactedList response
		csv_writer: object providing writerows(), e.g. a csv.writer

		returns: the number of data records written (0 for ignored lines)
		raises: ValueError if a matching line has no '[' or is not valid JSON
		"""
		# Guard clause: only the "events" lines carry data records.
		if not line.lstrip().startswith(EVENTS_LABEL):
			return 0

		# In the compacted format a single input line usually holds many records,
		# so one input line typically fans out into several csv rows.
		records = json.loads(line[line.index('['):])
		record_count = len(records)
		csv_writer.writerows(records)
		return record_count

	# ===========================================================
	# top-level file processing function
	#
	# input_file: path and file name of the file containing the druid compactedList response
	# output_file: destination path and file name. Overwritten without warning if existent
	#
	# returns: number of data records written (the header row is not counted)
	# ===========================================================
	def toCsv(input_file, output_file):
		"""Convert a druid compactedList scan response file into a csv file.

		Writes a fixed header row, then feeds every input line to processOneLine,
		which appends the data rows for the lines that contain records.

		input_file: path of the file containing the druid compactedList response
		output_file: destination path; opened for writing, so any existing file
			is overwritten without warning

		returns: number of data records written (the header row is not counted)
		"""
		# NOTE(review): the header is hard-coded; it presumably mirrors the column
		# order of the scan query that produced the input - confirm when reusing.
		columns = [
			'event_timestamp', 'project', 'event_entity', 'event_type', 'user_text', 'user_type',
			'page_title', 'page_namespace', 'page_type', 'other_tags', 'revisions',
			'text_bytes_diff', 'text_bytes_diff_abs', 'text_bytes_diff_sum', 'text_bytes_diff_abs_sum',
			'revisions_sum', 'events'
		]
		rows_written = 0
		with open(input_file, 'rt', encoding='UTF-8') as druid_response_file:
			# newline='' lets the csv module control line endings itself; without
			# it, platforms that translate '\n' on write emit '\r\r\n' row endings.
			with open(output_file, 'wt', encoding='UTF-8', newline='') as csv_output_file:
				csv_writer = csv.writer(csv_output_file, quoting=csv.QUOTE_ALL)
				csv_writer.writerow(columns)
				for line in druid_response_file:
					rows_written += processOneLine(line, csv_writer)
		# Both files are closed by their with-blocks; no explicit close() needed.
		return rows_written

	# ===========================================================
	# main script routine
	# ===========================================================
	def main():
		"""Convert INPUT_FILE to OUTPUT_FILE and print begin/end banners plus the row count."""
		print('------- begin to-csv.py -------')
		rows_written = toCsv(INPUT_FILE, OUTPUT_FILE)
		print('rows written:' + str(rows_written))
		print('------- end to-csv.py -------')

	# ===========================================================
	# script entry point
	# ===========================================================
	# Run the conversion only when executed as a script, so that importing this
	# module (e.g. to reuse toCsv) does not immediately read and write files.
	if __name__ == '__main__':
		main()