Skip to content

Instantly share code, notes, and snippets.

@santrancisco
Last active September 24, 2020 12:09
Show Gist options
  • Save santrancisco/d0aec75dccdfc534ce9bca145fafcd02 to your computer and use it in GitHub Desktop.
Save santrancisco/d0aec75dccdfc534ce9bca145fafcd02 to your computer and use it in GitHub Desktop.
Use AWS glue result to flatten json input, generate redshift create table command and jsonpaths file for COPY job
import json
import argparse
import re
parser = argparse.ArgumentParser()
parser.add_argument("file").required
args=parser.parse_args()
typemap={
"longstring":"VARCHAR(MAX)",
"string":"VARCHAR(MAX)",
"int":"INTEGER",
"boolean":"BOOLEAN",
"double":"REAL"
}
f=open(args.file,'r').read()
j=json.loads(f)
# print (json.dumps(j))
col = None
typestr="type"
namestr="name"
if j.get("TableList") != None: ## This is when we bother to run `aws glue get-tables`
col=j["TableList"][0]["StorageDescriptor"]["Columns"]
typestr="Type"
namestr="Name"
else: ## This is when we deal with json copied straight from AWS Console WebUI network traffic (aka being lazy)
col=j["actionResponses"][0]["data"]["tableVersions"][0]["table"]["storageDescriptor"]["columns"]
mymap = {}
for i in col:
if i[typestr].startswith("struct<"):
s=re.sub("array<[^>]*>","longstring",i[typestr])
s=re.sub("map<[^>]*>","longstring",s)
s=s.replace("struct<","{").replace(">","}")
s=re.sub(r'(?<={|,)([^:]*)(?=:)', r'"\1"', s)
s=re.sub(r'(?<=:)([a-zA-Z0-9]*)(?=}|,)', r'"\1"', s)
inner = json.loads(s)
mymap[i[namestr]] = inner
# print(s)
else:
t=i[typestr]
t=re.sub("array<[^>]*>","longstring",i[typestr])
t=re.sub("map<[^>]*>","longstring",t)
mymap[i[namestr]]=t
crt = "CREATE TABLE IF NOT EXISTS segment.eventlogs ("
jsonpaths = {"jsonpaths":[]}
jpath=[]
def simplerecurse(d,prefix,jprefix):
global crt
global jpath
for i in d:
# print ("%s - %s"%(i,d[i]))
if isinstance(d[i],dict):
p = prefix+i+"_"
jp = jprefix+"['"+i+"']"
simplerecurse(d[i],p,jp)
else:
crt += "\n%s%s\t%s,"%(prefix,i.replace(" ","").replace("-","_"),typemap[d[i]])
jpath.append(jprefix+"['"+i+"']")
simplerecurse(mymap,"","$")
jsonpaths["jsonpaths"] = jpath
crt=crt[:-1]+"\n)"
f=open("crt.txt","w")
f.write(crt)
f.close()
f=open("jsonpath.json","w")
f.write(json.dumps(jsonpaths, indent=4))
f.close()
print (crt)
print (json.dumps(jsonpaths, indent=4))
@santrancisco
Copy link
Author

This is a quick parser for glue result in aws. Glue is very good at identifying the data structure you deal with. After glue runs, you can run aws glue get-tables --database-name {your db name} > glue.json to get the tablelist and their data structure. This file contains column information for your data. This script run over it and create CREATE TABLE command for AWS Redshift and an aligned jsonpath.json file that you can use to load data using COPY command straight from S3 to your Redshift.
Eg:

COPY SchemaName.TableName from 's3://bucketname/objectid'
iam_role 'arn:aws:iam::0123456789012:role/MyRedshiftRole'
JSON 's3://bucketname_where_you_host/jsonpath.json' GZIP
region 'us-west-1';

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment