Skip to content

Instantly share code, notes, and snippets.

@bchess
Created February 6, 2019 06:15
Show Gist options
  • Save bchess/c0d2ee3b432df41f29bf850b9ff5cc5f to your computer and use it in GitHub Desktop.
Save bchess/c0d2ee3b432df41f29bf850b9ff5cc5f to your computer and use it in GitHub Desktop.
Generate a query that expands repeated fields in bigquery tables
import string
from google.cloud import bigquery
client = bigquery.Client()
DATASET_NAME = 'TODO'
TABLE_NAME = 'TODO'
dataset_ref = client.dataset(DATASET_NAME)
table_ref = dataset_ref.table(TABLE_NAME)
table = client.get_table(table_ref)
view_query = []
ALLOWED_CHARS = string.ascii_letters + string.digits + '_'
def to_field_name(unmunged):
munged = []
for c in unmunged:
if c not in ALLOWED_CHARS:
c = '_'
munged.append(c)
return ''.join(munged)
new_schema = []
for field in table.schema:
if field.mode != 'REPEATED':
view_query.append(field.name)
new_schema.append(field)
continue
for subfield in field.fields:
if subfield.name not in ('key', 'name'):
value_field = subfield
break
for subfield in field.fields:
if subfield.name not in ('key', 'name'):
continue
# Find the top N of these
top_n_query = ('SELECT f.{subfield_name} as k, count(*) as c '
'FROM `{table.dataset_id}.{table.table_id}` '
'CROSS JOIN UNNEST ({field_name}) as f '
'GROUP BY f.{subfield_name} '
'ORDER BY c DESC '
'LIMIT 10').format(
field_name=field.name,
subfield_name=subfield.name,
table=table
)
print(top_n_query)
query_job = client.query(top_n_query)
rows = query_job.result()
for row in rows:
new_field = bigquery.schema.SchemaField(
name='{field_name}_{k}'.format(field_name=field.name, k=to_field_name(row['k'])),
field_type=value_field.field_type,
)
new_schema.append(new_field)
view_query.append(
('(SELECT {value_field.name} '
'FROM UNNEST({field.name}) '
'WHERE {subfield.name} = {k} '
'LIMIT 1) as {new_field.name}'
).format(
value_field=value_field,
field=field,
subfield=subfield,
k=repr(row['k']),
new_field=new_field
)
)
# print(new_schema)
print('SELECT ', ','.join(view_query), 'FROM {table.dataset_id}.{table.table_id}'.format(table=table))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment