Last active
October 4, 2022 03:04
-
-
Save bpirkle/461e952ea39b2ef43cfd082d2b871ad9 to your computer and use it in GitHub Desktop.
druid-to-csv.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# This script converts a Druid compactedList response from a native | |
# query of type scan to csv. It is very specific to the use case | |
# for which it was created. No guarantees are made regarding its | |
# general-purpose suitability. | |
# | |
# Most functions herein don't do much error handling. | |
# Instead, exceptions will be thrown, which is messy but effective. | |
# | |
# Because of the lazy error handling, the "rows written" output is | |
# slightly suspect. It is included mostly as a rationality check. | |
# If the number looks incorrect, dig deeper to see what went wrong. | |
import json | |
import csv | |
# Default input: file holding the Druid scan response (compactedList format).
INPUT_FILE: str = 'query-output-compacted-list.json'
# Default output: csv file produced by this script.
OUTPUT_FILE: str = 'query-output.csv'
# Prefix (double quotes included) that a response line must start with, once
# leading whitespace is stripped, to be treated as a line carrying records.
EVENTS_LABEL: str = '"events"'
# =========================================================== | |
# processes one line of the input file
# | |
# line: one raw line from the druid compactedList response | |
# csv_writer: csv writer that receives the converted rows
# | |
# returns: the number of data records written | |
# =========================================================== | |
def processOneLine(line, csv_writer):
    """Write the records found on one line of the Druid response as csv rows.

    line: one raw line from the druid compactedList response
    csv_writer: object whose writerows() receives the parsed records

    returns: the number of data records written; 0 when the line does not
        start (after leading whitespace) with EVENTS_LABEL

    Parsing errors are not caught: a matching line without a '[' raises
    ValueError, and text that is not valid JSON raises from json.loads.
    """
    # Only lines carrying the events array hold data; everything else is skipped.
    if not line.lstrip().startswith(EVENTS_LABEL):
        return 0

    # In the compacted output format a single line of the response usually
    # holds many records, so one input line normally becomes several csv rows.
    # Everything from the first '[' to the end of the line is parsed as JSON.
    array_start = line.index('[')
    records = json.loads(line[array_start:])
    csv_writer.writerows(records)
    return len(records)
# =========================================================== | |
# top-level file processing function | |
# | |
# input_file: path and file name of the file containing the druid compactedList response | |
# output_file: destination path and file name. Overwritten without warning if existent | |
# | |
# returns: number of data records written (the header row is not counted)
# =========================================================== | |
def toCsv(input_file, output_file):
    """Convert a Druid compactedList scan response file to a csv file.

    Writes a fixed header row, then hands every line of input_file to
    processOneLine, which appends whatever data rows that line contains.

    input_file: path and file name of the file containing the druid
        compactedList response
    output_file: destination path and file name. Overwritten without
        warning if existent

    returns: number of data records written (the header row is not counted)

    raises: OSError if either file cannot be opened. Exceptions raised by
        processOneLine are not caught here and propagate to the caller.
    """
    columns = [
        'event_timestamp', 'project', 'event_entity', 'event_type', 'user_text', 'user_type',
        'page_title', 'page_namespace', 'page_type', 'other_tags', 'revisions',
        'text_bytes_diff', 'text_bytes_diff_abs', 'text_bytes_diff_sum', 'text_bytes_diff_abs_sum',
        'revisions_sum', 'events',
    ]
    rows_written = 0
    # newline='' is required by the csv module: the writer emits its own
    # '\r\n' row terminators, and letting the text layer translate them again
    # produces '\r\r\n' (blank rows) on platforms whose line separator is '\r\n'.
    with open(input_file, 'rt', encoding='UTF-8') as druid_response_file, \
            open(output_file, 'wt', encoding='UTF-8', newline='') as csv_output_file:
        csv_writer = csv.writer(csv_output_file, quoting=csv.QUOTE_ALL)
        csv_writer.writerow(columns)
        for line in druid_response_file:
            rows_written += processOneLine(line, csv_writer)
    # Both files are closed by the with statement, even if an exception
    # escapes the loop, so no explicit close() calls are needed.
    return rows_written
# =========================================================== | |
# main script routine | |
# =========================================================== | |
def main():
    """Convert INPUT_FILE to OUTPUT_FILE and report the row count on stdout."""
    print('------- begin to-csv.py -------')
    record_count = toCsv(INPUT_FILE, OUTPUT_FILE)
    # The count excludes the header row; it is printed as a quick check.
    print(f'rows written:{record_count}')
    print('------- end to-csv.py -------')
# =========================================================== | |
# script entry point | |
# =========================================================== | |
# Run the conversion only when this file is executed as a script, so that
# importing the module (e.g. to reuse toCsv) does not trigger file I/O.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment