Skip to content

Instantly share code, notes, and snippets.

@justinjm
Forked from igama/bigquery_schema.py
Created August 3, 2016 20:10
Show Gist options
  • Save justinjm/63006ff0214ee37606df35f9ec20a3f2 to your computer and use it in GitHub Desktop.
Save justinjm/63006ff0214ee37606df35f9ec20a3f2 to your computer and use it in GitHub Desktop.
Sense / infer / generate a BigQuery schema string for import #bigquery
import mimetypes
import sys
from collections import OrderedDict
from itertools import islice
# The file to inspect is the first command-line argument. Exit with a usage
# message instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: %s <tab-separated-file[.gz]>" % sys.argv[0])
filename = sys.argv[1]
def file_type(filename):
    """Guess the MIME type and encoding of *filename* from its name.

    The file is not opened; only the name is inspected.

    Returns:
        The ``(type, encoding)`` pair produced by ``mimetypes.guess_type``.
        Either element may be ``None`` when it cannot be guessed; the
        encoding is ``"gzip"`` for names ending in ``.gz``.
    """
    return mimetypes.guess_type(filename)
# Only the encoding half of the (type, encoding) pair is needed: it says
# whether the input is gzip-compressed.
filetype = file_type(filename)[1]
if filetype == "gzip":
    import gzip
    readfile = gzip.GzipFile(filename, 'r')
else:
    readfile = open(filename, 'r')

# Maximum number of data rows sampled for type sensing.
SAMPLE_SIZE = 500

with readfile as f:
    # GzipFile yields bytes on Python 3; decode those so both branches hand
    # text to the parsing below (assumes UTF-8 input -- TODO confirm).
    rows = (raw if isinstance(raw, str) else raw.decode("utf-8") for raw in f)
    header_line = next(rows, None)
    if header_line is None:
        sys.exit("%s is empty: no header row to build a schema from" % filename)
    header = header_line.strip().split("\t")
    # islice stops quietly at end of file, so an input with fewer than
    # SAMPLE_SIZE data rows is sampled in full instead of raising
    # StopIteration.
    # NOTE: strip() also removes leading/trailing tabs, so empty fields at
    # either edge of a row are dropped before the split.
    lines = [
        dict(zip(header, row.strip().split("\t")))
        for row in islice(rows, SAMPLE_SIZE)
    ]

# Every column starts out as bool; the sensing pass revises it from the data.
schema = OrderedDict(zip(header, [bool] * len(header)))
def boolify(s):
    """Return the boolean spelled by the literal *s*.

    Recognises "true"/"t" and "false"/"f" in any letter case, so the
    original spellings ("True", "TRUE", "T", "False", "FALSE", "F") keep
    working and lower-case data is accepted as well.

    Raises:
        ValueError: if *s* is not one of the recognised literals.
    """
    try:
        token = s.lower()
    except AttributeError:
        # Not string-like: compare it as-is so the ValueError below fires.
        token = s
    if token in ("true", "t"):
        return True
    if token in ("false", "f"):
        return False
    raise ValueError("not a boolean literal: %r" % (s,))
def autoconvert(s):
    """Convert the text *s* to the most specific Python value it spells.

    The converters are tried in order -- ``boolify``, ``int``, ``float`` --
    and the first one that does not raise ``ValueError`` wins.

    Returns:
        A ``bool``, ``int`` or ``float`` when *s* parses as one, otherwise
        *s* itself, unchanged.
    """
    # Python 3.6+ lets int() and float() parse digit-group underscores
    # ("1_000" -> 1000). A cell written that way is kept as text rather
    # than being sensed as a number.
    if isinstance(s, str) and "_" in s:
        return s
    for fn in (boolify, int, float):
        try:
            return fn(s)
        except ValueError:
            pass
    return s
# Rank of each sensed Python type, from str (0) up to bool (3); the ranks are
# compared when a column's values disagree on type.
type_precedence = {
    sensed: rank for rank, sensed in enumerate((str, float, int, bool))
}

# Schema type name written out for each sensed Python type.
type_map = {
    str: "STRING",
    float: "FLOAT",
    int: "INTEGER",
    bool: "BOOL",
}
# Sense the type of every column from the sampled rows.
#
# The schema is seeded with bool for every column, so a seeded bool cannot be
# told apart from an observed one. Track which columns have actually been
# seen: the first value sets the type outright, later values can only widen it.
observed = set()
for line in lines:
    for k, v in line.items():
        sense_type = type(autoconvert(v))
        current = schema[k]
        if k not in observed:
            observed.add(k)
            schema[k] = sense_type
        elif current is sense_type or current is str:
            # Already agrees, or already the catch-all type.
            continue
        elif bool in (current, sense_type):
            # Boolean literals mixed with numbers or text: neither INTEGER
            # nor FLOAT can hold "T"/"F", so fall back to STRING.
            schema[k] = str
        elif type_precedence[current] > type_precedence[sense_type]:
            # int -> float, or a numeric type -> str.
            schema[k] = sense_type

# A parenthesised single argument prints the same under Python 2 and 3.
print(','.join(k + ":" + type_map[v] for k, v in schema.items()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment