Skip to content

Instantly share code, notes, and snippets.

@gwenshap
Created April 29, 2014 18:57
Show Gist options
  • Save gwenshap/11408870 to your computer and use it in GitHub Desktop.
Save gwenshap/11408870 to your computer and use it in GitHub Desktop.
generate table from Avro schema
#!/usr/bin/python
import json
import argparse
def convertType(type):
    """Translate an Avro primitive type name into a Hive column type.

    Only the names whose Hive spelling differs from the Avro one are
    rewritten; any other value is returned unchanged.

    Args:
        type: Avro type name as found in the schema (e.g. ``"long"``).
            The parameter name shadows the builtin but is kept so that
            existing keyword callers keep working.

    Returns:
        The Hive type name: ``"bigint"`` for ``"long"``, ``"binary"`` for
        ``"bytes"``, otherwise the input itself.
    """
    if type == "long":
        return "bigint"
    if type == "bytes":
        # Hive has no "bytes" column type; the AvroSerDe maps Avro bytes
        # to Hive binary (Hive 0.12+ per the AvroSerDe documentation).
        return "binary"
    return type
def _resolve_avro_type(avro_type):
    """Reduce a field's Avro ``type`` entry to a single primitive type name.

    Args:
        avro_type: The value of a field's ``"type"`` key: either a type
            name, or a union given as a list of branches.

    Returns:
        The type name itself, or for a union its first branch that is not
        ``"null"`` (falling back to the first branch when all are null).

    Raises:
        ValueError: If the resolved type is a complex (dict) definition,
            which this generator cannot express as a column type.
    """
    if isinstance(avro_type, list):
        # Nullable fields are commonly written ["null", T] as well as
        # [T, "null"]; the column type is T in both cases.
        non_null = [branch for branch in avro_type if branch != "null"]
        avro_type = non_null[0] if non_null else avro_type[0]
    if isinstance(avro_type, dict):
        raise ValueError("unsupported complex Avro type: %r" % (avro_type,))
    return avro_type


def gen_columns(schema):
    """Build the parenthesised Hive column list for an Avro record schema.

    Args:
        schema: Parsed Avro schema (a dict) with a ``"fields"`` list whose
            entries each carry ``"name"`` and ``"type"`` keys.

    Returns:
        A string such as ``"(id bigint,name string)"`` with one
        ``<name> <hive type>`` pair per field, comma-separated.

    Raises:
        ValueError: If a field's type resolves to a complex Avro type.
    """
    columns = [
        '%s %s' % (field['name'], convertType(_resolve_avro_type(field['type'])))
        for field in schema['fields']
    ]
    return "(" + ",".join(columns) + ")"
# Command-line interface: table name, data location and schema URL are
# required; --partitions is an optional, pre-formatted partition spec.
parser = argparse.ArgumentParser(description='Generate a create external table DDL that includes column definitions from a given Avro schema')
parser.add_argument('table_name', help='Name of Hive table')
parser.add_argument('location', help='HDFS path that contains table data')
parser.add_argument('AvroSchema', help='HDFS path with Avro schema')
parser.add_argument('--partitions', help='If creating a partitioned table, specify the partitions')
args = parser.parse_args()

# The column definitions come from a local copy of the schema at
# /tmp/<table_name>.schema; the AvroSchema argument is only echoed into
# TBLPROPERTIES below. The with-block closes the file once it is parsed.
with open('/tmp/' + args.table_name + '.schema', 'r') as schema_file:
    schema_obj = json.load(schema_file)

# The partition spec is inserted verbatim after "partitioned by".
if args.partitions:
    partitions = "partitioned by " + args.partitions
else:
    partitions = ""

# Parenthesised single-argument print: same output under Python 2, and
# valid syntax under Python 3.
print("""CREATE EXTERNAL TABLE %s
%s
%s
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
location '%s'
TBLPROPERTIES ( 'avro.schema.url'='%s')""" % (args.table_name, gen_columns(schema_obj), partitions, args.location, args.AvroSchema))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment