Created
July 23, 2019 22:27
-
-
Save QuantVI/0250395f0b893eb04dd25ed249ed4169 to your computer and use it in GitHub Desktop.
The scribe takes in JSON output from ElasticSearch results. It then dynamically transforms the output into tabular format, writing the data straight to an Excel file using an applicable Excel module.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-

# Module summary; also reused verbatim as the argparse --help description
# in the __main__ section below.
desc1 = ''' The scribe takes in JSON output from ElasticSearch results.
It then dynamically transforms the output into tabular format,
writing the data straight to an Excel file using an applicable
Excel module.'''
# See report version 1 "scribe_report.py" for code comments on the core build
# version 1 code now in excel_printer.py
import json | |
import openpyxl | |
import excel_printer | |
import argparse | |
import os | |
# This next section can be its own script if necessary | |
# begin dont_overwrite ---- | |
def dont_overwrite(this_file): | |
'''Returns the next generic fileame that you can save to without | |
overwriting an existing file of that name. | |
generic name must resemble blah_nn_.ext | |
- blah: you choose this name, like "outPut" | |
- nn: two digit number, like "04" | |
- .ext: any extension, like ".csv" | |
''' | |
import sys | |
import os | |
if os.path.exists(this_file): | |
p = this_file.split('_') | |
p1int = int(p[1]) | |
next_int_str = str(p1int+1) | |
if p1int < 9: | |
next_int_str = ''.join(['0',str(p1int+1)]) | |
next_file = '_'.join([p[0],next_int_str,p[2]]) | |
return dont_overwrite(next_file) | |
else: | |
return this_file | |
# end dont_overwrite section ---- | |
# Default workbook name; deliberately follows the blah_nn_.ext pattern
# that dont_overwrite() expects, so a fresh non-clobbering name can be
# derived from it.
output_filename = 'out_00_.xlsx'
# # # out_to = dont_overwrite(output_filename)
def scribe_reports(path='', existing='n', xlfile=output_filename):
    '''Convert every .json file under ``path`` into sheets of one Excel file.

    path     -- directory (relative to the cwd) holding the JSON inputs
    existing -- 'y'/'Y' to append to an already-existing workbook ``xlfile``;
                anything else creates a new workbook first
    xlfile   -- output workbook name; when left at the default, a fresh
                non-clobbering name is derived via dont_overwrite()

    Note: new_sheet is always passed to printInExcel — what actually varies
    is whether a new *file* is created.  Only supply ``xlfile`` yourself if
    that file already exists.
    '''
    def _get_files(path_to):
        '''Return the names of all .json files in path_to (joined to cwd).'''
        # Bug fix: the original closed over the outer ``path`` and ignored
        # its own parameter.
        files_loc = '/'.join((os.getcwd(), path_to))
        return [name for name in os.listdir(files_loc) if '.json' in name]

    def _read_json_from_file(filename):
        '''Load JSON from filename; on invalid JSON return a marker dict.'''
        print('reading from: %s' % filename)
        op = open(filename, 'rb')
        try:
            rd = op.read()
        finally:
            op.close()
        try:
            return json.loads(rd)
        except ValueError:
            return {filename: 'was not valid json'}

    # Bug fix: the original tested ``existing in 'yY'``, which is also True
    # for the empty string (substring membership).
    if existing in ('y', 'Y'):
        out_to = xlfile
    else:
        # Only derive a fresh name when the caller kept the default name.
        out_to = dont_overwrite(xlfile) if xlfile == output_filename else xlfile
        created_wb = openpyxl.Workbook()
        created_wb.save(filename=out_to)

    lof = _get_files(path)
    print(lof)
    for a_file in lof:
        # Join with the input directory so a non-cwd ``path`` works
        # (the original opened bare names relative to the cwd).
        jsondata = _read_json_from_file(os.path.join(path, a_file))
        struc = excel_printer.detectStructure(jsondata)
        handled = excel_printer.handleStructuredData(jsondata, struc)
        excel_printer.printInExcel(jsondata, handled, out_to, 'new_sheet')
# NOTE(review): the triple-quoted block below is scratch/debug code kept for
# reference.  As a bare, unassigned string literal it never executes; the
# filenames it opens are presumably local test fixtures — safe to delete
# once no longer needed.
'''
# level_1: type B, one 1st level agg with multiple dict items in the bucket
qqq = open('_ohhs-status-over-year-test')
www = qqq.read()
qqq.close()
eee = json.loads(www)
# level_2
aaa = open('_emma_vi_res-per-yr-with-status-code-test-2')
sss = aaa.read()
aaa.close()
ddd = json.loads(sss)
# multi_s
zzz = open('_test_msearch','rb')
xxx = zzz.read()
zzz.close()
ccc = json.loads(xxx)
# level_1 : type A, multiple 1st level aggs
ttt = open('_ohhs-codes-test','rb')
ggg = ttt.read()
ttt.close()
bbb = json.loads(ggg)
'''
if __name__ == '__main__':
    # CLI flag table: short key -> [long option, help text].
    cli_opts = {
        'p': ['--path', 'location of the json input files'],
        'x': ['--existing', 'y or n if the output xlsx already exists'],
        'f': ['--file', 'if existing, location of the excel file to write to'],
    }

    def parse_args():
        '''Build the argument parser from cli_opts and parse sys.argv.'''
        parser = argparse.ArgumentParser(description=desc1)
        parser.add_argument('-p', cli_opts['p'][0], help=cli_opts['p'][1],
                            default='')
        parser.add_argument('-x', cli_opts['x'][0], help=cli_opts['x'][1],
                            default='n')
        parser.add_argument('-f', cli_opts['f'][0], help=cli_opts['f'][1],
                            default=output_filename)
        return parser.parse_args()

    args = parse_args()
    print(args)
    scribe_reports(args.path, args.existing, args.file)
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.
Scribe Report is a custom file I wrote at work to help download JSON data from ElasticSearch and convert these results into single or multi-sheet Excel files that colleagues in other departments could read and understand. This was part of a "data audit" process, to check the data quality of imported data from newly integrated clients.
Scribe report relies heavily on my other custom script "Excel Printer". The Excel Printer looks at the generalized structure of the JSON results (e.g. hits or aggregates with 1,2, or 3 nested levels) and converts the results into the appropriate Excel table format.
Looking at the "general" structure of the ElasticSearch JSON results meant I did not need to write a specific parser for each different ElasticSearch data query. (The set of ElasticSearch queries was originally converted from SQL queries used for the same process.)