Skip to content

Instantly share code, notes, and snippets.

@turbo
Forked from igrigorik/json-bq-schema-generator.rb
Created September 12, 2017 21:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save turbo/0e4186bb499df8e85bcfbd9d0c8c2a88 to your computer and use it in GitHub Desktop.
Save turbo/0e4186bb499df8e85bcfbd9d0c8c2a88 to your computer and use it in GitHub Desktop.
BigQuery JSON schema generator
require 'open-uri'
require 'zlib'
require 'yajl'
# References
# - https://developers.google.com/bigquery/preparing-data-for-bigquery#dataformats
# - https://developers.google.com/bigquery/docs/data#nested
#
# Maps a decoded JSON value to the name of the BigQuery column type used for it.
#
# t - a value produced by the JSON parser: Float, Integer, String, true/false,
#     Hash, or an Array of those. An Array is typed by its first element.
#
# Returns one of 'FLOAT', 'INTEGER', 'STRING', 'BOOLEAN' or 'RECORD'.
# Raises TypeError for any other value (nil and empty Arrays included). The
# message carries the offending value and its class, so nothing is written
# to stdout, which is where the generated schema is printed.
def type(t)
  case t
  when Float then 'FLOAT'
  when Integer then 'INTEGER'
  when String then 'STRING'
  when TrueClass, FalseClass then 'BOOLEAN'
  when Hash then 'RECORD'
  when Array then type(t.first)
  else
    # TypeError is a StandardError, so an ordinary `rescue` can handle it;
    # handlers written as `rescue Exception` still catch it as well.
    raise TypeError, "Unknown type for #{t}, #{t.class}"
  end
end
# Picks the BigQuery field mode for a decoded JSON value.
#
# e - any value produced by the JSON parser.
#
# Returns 'REPEATED' when e is an Array, 'NULLABLE' for everything else.
def mode(e)
  return 'REPEATED' if e.is_a?(Array)

  'NULLABLE'
end
# Folds the shape of one decoded JSON object into a schema, in place.
#
# target - Array of field descriptor Hashes with the string keys 'name',
#          'type', 'mode' and, for RECORD fields, 'fields'. Descriptors are
#          appended or updated as keys are encountered.
# event  - Hash holding one decoded JSON object.
#
# Returns the result of event.each_pair; callers rely on the mutation of
# target rather than on the return value.
# Raises whatever type() raises for a value it cannot classify.
def traverse(target, event)
  event.each_pair do |k, v|
    desc = target.find { |e| e['name'] == k } || {}
    target << desc if desc.empty?
    desc['name'] = k

    # Note: nil values and Arrays whose first element is nil carry no type
    # information, so they are skipped until a usable value shows up. A field
    # that never gets one is left without 'type'/'mode' and has to be dealt
    # with afterwards; BigQuery would reject such a declaration.
    next if v.nil? || (v.is_a?(Array) && v.first.nil?)

    observed = type(v)
    # A column that has already held a fractional number must stay FLOAT.
    # Letting a later whole number narrow it to INTEGER would make the result
    # depend on record order and produce a schema the earlier rows violate.
    desc['type'] = observed unless desc['type'] == 'FLOAT' && observed == 'INTEGER'
    desc['mode'] = mode(v)

    next unless desc['type'] == 'RECORD'

    desc['fields'] ||= []
    # A single nested object is handled like a one-element list of objects.
    records = desc['mode'] == 'REPEATED' ? v : [v]
    records.each do |record|
      traverse(desc['fields'], record) unless record.nil?
    end
  end
end
# Script entry point: read the JSON input named by the first command-line
# argument and fold every parsed object into @fields.
@fields = []
file = ARGV[0]
abort "Usage: #{$PROGRAM_NAME} <path or URL of a .json or .json.gz file>" if file.nil?

# open-uri no longer makes Kernel#open fetch URLs on Ruby 3.0+, so prefer
# URI.open where it exists. Older Rubies without URI.open keep using the
# patched Kernel#open, exactly as before.
source = URI.respond_to?(:open) ? URI.open(file) : open(file)
begin
  data = source
  if File.extname(file) == '.gz'
    # Gzipped input is inflated fully in memory before parsing.
    data = Zlib::GzipReader.new(StringIO.new(source.read)).read
  end
  Yajl::Parser.parse(data) do |event|
    traverse(@fields, event)
  end
ensure
  # Release the file or download handle even when parsing fails.
  source.close
end
# Walks a schema and patches every descriptor that is missing a name or a
# type, recursing into nested 'fields' lists.
#
# target - Array of field descriptor Hashes; modified in place.
#
# A descriptor whose 'name' or 'type' is absent or nil is reported on STDERR
# and then forced to type 'STRING' with mode 'NULLABLE'.
#
# Returns target.
def check(target)
  target.each do |field|
    incomplete = field.values_at('name', 'type').any?(&:nil?)
    if incomplete
      STDERR.puts "Warning: #{field} has an unknown type."
      field['type'] = 'STRING'
      field['mode'] = 'NULLABLE'
    end

    nested = field['fields']
    check(nested) if nested
  end
end
# Patch up any fields left without a type, then print the finished schema
# to stdout as pretty-printed JSON.
check(@fields)
schema_json = Yajl::Encoder.encode(@fields, pretty: true)
puts schema_json
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment