Skip to content

Instantly share code, notes, and snippets.

@bpirkle
Last active October 4, 2022 03:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bpirkle/461e952ea39b2ef43cfd082d2b871ad9 to your computer and use it in GitHub Desktop.
Save bpirkle/461e952ea39b2ef43cfd082d2b871ad9 to your computer and use it in GitHub Desktop.
druid-to-csv.py
#!/usr/bin/python
# This script converts a Druid compactedList response from a native
# query of type scan to csv. It is very specific to the use case
# for which it was created. No guarantees are made regarding its
# general-purpose suitability.
#
# Most functions herein don't do much error handling.
# Instead, exceptions will be thrown, which is messy but effective.
#
# Because of the lazy error handling, the "rows written" output is
# slightly suspect. It is included mostly as a rationality check.
# If the number looks incorrect, dig deeper to see what went wrong.
import json
import csv
INPUT_FILE = 'query-output-compacted-list.json'
OUTPUT_FILE = 'query-output.csv'
EVENTS_LABEL = '"events"'
# ===========================================================
# processes an input file line
#
# line: one raw line from the druid compactedList response
# csv_output_file: csv writer
#
# returns: the number of data records written
# ===========================================================
def processOneLine(line, csv_writer):
    """Emit the records contained in one druid response line as csv rows.

    line: one raw line from the druid compactedList response
    csv_writer: csv writer to receive the rows

    returns: the number of data records written (0 for non-event lines)
    """
    # Only lines carrying the "events" payload contain data records.
    if not line.lstrip().startswith(EVENTS_LABEL):
        return 0
    # In the compacted output format, a single line of the druid json
    # response holds a JSON array of records, so one input line usually
    # becomes several csv rows.
    records = json.loads(line[line.index('['):])
    csv_writer.writerows(records)
    return len(records)
# ===========================================================
# top-level file processing function
#
# input_file: path and file name of the file containing the druid compactedList response
# output_file: destination path and file name. Overwritten without warning if existent
#
# returns: number of data records written (the header row is not counted)
# ===========================================================
def toCsv(input_file, output_file):
    """Convert a druid compactedList response file to a csv file.

    input_file: path and file name of the file containing the druid compactedList response
    output_file: destination path and file name. Overwritten without warning if existent

    returns: number of data records written (header row is not counted)
    """
    rows_written = 0
    # One combined `with` manages both files; they are closed automatically
    # on exit (the original called .close() redundantly inside the blocks).
    with open(input_file, 'rt', encoding='UTF-8') as druid_response_file, \
         open(output_file, 'wt', encoding='UTF-8') as csv_output_file:
        csv_writer = csv.writer(csv_output_file, quoting=csv.QUOTE_ALL)
        # Column order must match the field order of the druid scan records.
        columns = [
            'event_timestamp', 'project', 'event_entity', 'event_type', 'user_text', 'user_type',
            'page_title', 'page_namespace', 'page_type', 'other_tags', 'revisions',
            'text_bytes_diff', 'text_bytes_diff_abs', 'text_bytes_diff_sum', 'text_bytes_diff_abs_sum',
            'revisions_sum', 'events'
        ]
        csv_writer.writerow(columns)
        for line in druid_response_file:
            rows_written += processOneLine(line, csv_writer)
    return rows_written
# ===========================================================
# main script routine
# ===========================================================
def main():
    """Drive the conversion and report the record count on stdout."""
    print('------- begin to-csv.py -------')
    total = toCsv(INPUT_FILE, OUTPUT_FILE)
    # %-formatting produces the exact same text as the original concatenation.
    print('rows written:%d' % total)
    print('------- end to-csv.py -------')
# ===========================================================
# script entry point
# ===========================================================
# Guarded so importing this module (e.g. for reuse or testing)
# no longer triggers the conversion as a side effect; running it
# as a script behaves exactly as before.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment