@QuantVI
Created July 23, 2019 22:27
The scribe takes in JSON output from ElasticSearch results. It then dynamically transforms the output into tabular format, writing the data straight to an Excel file using an applicable Excel module.
# -*- coding: utf-8 -*-
desc1 = ''' The scribe takes in JSON output from ElasticSearch results.
It then dynamically transforms the output into tabular format,
writing the data straight to an Excel file using an applicable
Excel module.'''

# See report version 1 "scribe_report.py" for code comments on the core build
# version 1 code now in excel_printer.py
import json
import openpyxl
import excel_printer
import argparse
import os
# This next section can be its own script if necessary
# begin dont_overwrite ----
def dont_overwrite(this_file):
    '''Returns the next generic filename that you can save to without
    overwriting an existing file of that name.
    The generic name must resemble blah_nn_.ext
    - blah: you choose this name, like "outPut"
    - nn: two digit number, like "04"
    - .ext: any extension, like ".csv"
    '''
    import sys
    import os
    if os.path.exists(this_file):
        p = this_file.split('_')
        p1int = int(p[1])
        next_int_str = str(p1int+1)
        if p1int < 9:
            # zero-pad single-digit counters so the name stays blah_nn_.ext
            next_int_str = ''.join(['0', str(p1int+1)])
        next_file = '_'.join([p[0], next_int_str, p[2]])
        return dont_overwrite(next_file)
    else:
        return this_file
# end dont_overwrite section ----
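
# Illustrative example (the filenames below are assumptions, not real files):
# with 'out_00_.xlsx' and 'out_01_.xlsx' already on disk,
# dont_overwrite('out_00_.xlsx') keeps incrementing the counter and
# returns 'out_02_.xlsx'.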
output_filename='out_00_.xlsx'
# # # out_to = dont_overwrite(output_filename)
def scribe_reports(path='', existing='n', xlfile=output_filename):

    def get_files(path_to):
        # list only the .json files in the given directory
        files_loc = '/'.join((os.getcwd(), path_to))
        raw_list_of_files = os.listdir(files_loc)
        proc_lof = []
        for name in raw_list_of_files:
            if ('.json' in name):
                proc_lof.append(name)
        return proc_lof

    out_to = None
    exz = existing
    if exz in 'yY':
        # append to the workbook the caller says already exists
        out_to = xlfile
    else:
        # otherwise pick a non-clashing generic name and create a fresh workbook
        out_to = dont_overwrite(xlfile) if xlfile == output_filename else xlfile
        created_wb = openpyxl.Workbook()
        created_wb.save(filename=out_to)

    lof = get_files(path)
    print lof

    # new_sheet should always be the case
    # we actually care about when to have a new file
    # in fact, we aren't using the "existing" parameter
    # - if you supply a filename, we assume it exists and append to the file.
    # - otherwise we use a standard file name
    # ** only supply a filename if it exists
    # - this can be changed later to supply a "new" filename

    # this function should be modified to properly use the path
    def read_json_from_file(filename):
        print 'reading from: ', filename
        op = open(filename, 'rb')
        rd = op.read()
        op.close()
        try:
            js = json.loads(rd)
        except ValueError:
            # keep going even if one file is malformed; note it in the output
            js = {filename: 'was not valid json'}
        return js
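
    # For each JSON file: detect its general structure, flatten it via
    # excel_printer, and write the result to a new sheet in the output workbook.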
    for a_file in lof:
        jsondata = read_json_from_file(a_file)
        struc = excel_printer.detectStructure(jsondata)
        handled = excel_printer.handleStructuredData(jsondata, struc)
        excel_printer.printInExcel(jsondata, handled, out_to, 'new_sheet')
'''
# level_1: type B, one 1st level agg with multiple dict items in the bucket
qqq = open('_ohhs-status-over-year-test')
www = qqq.read()
qqq.close()
eee = json.loads(www)
# level_2
aaa = open('_emma_vi_res-per-yr-with-status-code-test-2')
sss = aaa.read()
aaa.close()
ddd = json.loads(sss)
# multi_s
zzz = open('_test_msearch','rb')
xxx = zzz.read()
zzz.close()
ccc = json.loads(xxx)
# level_1 : type A, multiple 1st level aggs
ttt = open('_ohhs-codes-test','rb')
ggg = ttt.read()
ttt.close()
bbb = json.loads(ggg)
'''
if __name__ == '__main__':
    ar = {
        'p': ['--path', 'location of the json input files'],
        'x': ['--existing', 'y or n if the output xlsx already exists'],
        'f': ['--file', 'if existing, location of the excel file to write to']
    }

    def parse_args():
        parser = argparse.ArgumentParser(description=desc1)
        parser.add_argument('-p', ar['p'][0], help=ar['p'][1], default='')
        parser.add_argument('-x', ar['x'][0], help=ar['x'][1], default='n')
        parser.add_argument('-f', ar['f'][0], help=ar['f'][1],
                            default=output_filename)
        return parser.parse_args()

    args = parse_args()
    print args
    scribe_reports(args.path, args.existing, args.file)
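
# Example invocation (run from the directory that holds the .json exports;
# the script name here is illustrative):
#   python scribe_reports.py -x n
# Each .json file in the current directory is written to its own sheet of a
# newly created out_nn_.xlsx workbook.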
QuantVI commented Jul 23, 2019

Scribe Report is a custom file I wrote at work to help download JSON data from ElasticSearch and convert the results into single- or multi-sheet Excel files that colleagues in other departments could read and understand. This was part of a "data audit" process to check the quality of data imported from newly integrated clients.

Scribe Report relies heavily on my other custom script, "Excel Printer". Excel Printer looks at the generalized structure of the JSON results (e.g. hits, or aggregates with 1, 2, or 3 nested levels) and converts the results into the appropriate Excel table format.

Working from the "general" structure of the ElasticSearch JSON results meant I did not need to write a specific parser for each different ElasticSearch query. (The set of ElasticSearch queries had been converted from the SQL queries originally used for the same process.)
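
For context, here is a minimal sketch of the kind of transformation this describes for the simplest case, a single one-level terms aggregation. It is illustrative only: the sample field values and the flatten_one_level_agg helper are assumptions for this example, not the actual excel_printer code.

# Illustrative sketch only -- not the actual excel_printer implementation.
import openpyxl

def flatten_one_level_agg(es_response, agg_name):
    '''Turn one terms aggregation into (key, doc_count) rows.'''
    buckets = es_response['aggregations'][agg_name]['buckets']
    return [(b['key'], b['doc_count']) for b in buckets]

sample = {'aggregations': {'status_codes': {'buckets': [
    {'key': 'ACTIVE', 'doc_count': 120},
    {'key': 'CLOSED', 'doc_count': 45}]}}}

wb = openpyxl.Workbook()
ws = wb.active
ws.append(['key', 'doc_count'])
for row in flatten_one_level_agg(sample, 'status_codes'):
    ws.append(row)
wb.save('example_agg.xlsx')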
