@thekensta
Created December 6, 2017 21:26
Load and Extract DATETIMEs and TIMESTAMPs in BigQuery
#!/usr/bin/env bash
DATASET="<YOUR DATASET HERE>"
TABLE="fun_with_time"
SCHEMA_FILE="/tmp/ts-schema.json"
DATA_FILE="/tmp/ts-data.ndjson"
SQL_FILE="/tmp/query.sql"
BUCKET="<YOUR BUCKET HERE>"
cat <<EOM >${SCHEMA_FILE}
[
{"name": "name", "type": "string"},
{"name": "dt", "type": "datetime"},
{"name": "ts", "type": "timestamp"}
]
EOM
bq rm -f ${DATASET}.${TABLE}
bq mk -t -f ${DATASET}.${TABLE} ${SCHEMA_FILE}
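# Note: type names in the schema JSON are case-insensitive; the lowercase
# "datetime" and "timestamp" above map to the DATETIME and TIMESTAMP types,
# as the bq show output further down confirms.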
cat <<EOM >${DATA_FILE}
{"name": "dt string - 1", "dt": "2017-12-01T12:34:56.789012", "ts": null}
{"name": "dt string - 2", "dt": "2017-12-01T12:34:56.789012+06:00", "ts": null}
{"name": "dt string - 3", "dt": "2017-12-01 12:34:56.789012", "ts": null}
{"name": "dt float - 1", "dt": "1512131696.789012", "ts": null}
{"name": "ts string - 1", "dt": null, "ts": "2017-12-01T12:34:56.789012"}
{"name": "ts float seconds", "dt": null, "ts": 1512131696.789012}
{"name": "ts int seconds", "dt": null, "ts": 1512131696}
{"name": "ts string seconds", "dt": null, "ts": "1512131696"}
{"name": "ts float seconds", "dt": null, "ts": "1512131696.789"}
{"name": "ts string + tz", "dt": null, "ts": "2017-12-01T12:34:56.789012+06:00"}
{"name": "ts string + z", "dt": null, "ts": "2017-12-01T12:34:56.789012Z"}
{"name": "ts string + tz name", "dt": null, "ts": "2017-12-01T12:34:56.789012 Europe/London"}
{"name": "ts string + utc", "dt": null, "ts": "2017-12-01T12:34:56.789012 UTC"}
EOM
# [query] Invalid datetime string "2017-12-01T12:34:56.789012+06:00" Field: dt; Value: 2017-12-01T12:34:56.789012+06:00
# [query] Invalid datetime string "1512131696.789012" Field: dt; Value: 1512131696.789012
# [mediaupload-snapshot] Error while reading data, error message: JSON parsing error in
# row starting at position 772: Couldn't convert value to timestamp:
# Unrecognized timezone: Europe/London Field: ts; Value: 2017-12-01T12:34:56.789012 Europe/London
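# The two dt failures above are expected: DATETIME is a civil, timezone-less
# type, so offset-qualified or epoch-style values must be converted to civil
# time before loading. A minimal sketch of that conversion in standard SQL
# (the /tmp/dt-convert.sql name is illustrative, not part of the original):
cat <<EOM >/tmp/dt-convert.sql
select datetime(timestamp "2017-12-01T12:34:56.789012+06:00", "UTC") as dt_from_offset,
       datetime(timestamp_seconds(1512131696)) as dt_from_epoch
EOM
bq query --use_legacy_sql=False < /tmp/dt-convert.sql
# dt_from_offset: 2017-12-01T06:34:56.789012, dt_from_epoch: 2017-12-01T12:34:56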
echo "--- Following data manually excluded as crashed the load process ---"
# These will actually crash the load
#
echo '{"name": "ts int millis", "dt": null, "ts": 1496316896789}'
echo '{"name": "ts int millis", "dt": null, "ts": 1512131696789}'
echo '{"name": "ts int micros", "dt": null, "ts": 1512131696789012}'
echo " --> The range of valid timestamp values is [0001-01-1 00:00:00, 9999-12-31 23:59:59.999999];"
echo " -------------------------------------------------------------------"
#
# Error in query string: Error processing job '<JOB_ID>':
# Cannot return an invalid timestamp value of -501317255171232512 microseconds relative to the Unix
# epoch. The range of valid timestamp values is [0001-01-1 00:00:00, 9999-12-31 23:59:59.999999];
# error in writing field ts
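# Why these crash: a bare number in JSON is read as *seconds* since the Unix
# epoch, so epoch millis/micros land tens of thousands of years outside the
# valid range (the micros value even overflows INT64 when scaled, hence the
# negative number in the error above). A hedged workaround sketch: keep
# millis/micros as INT64 and convert in SQL with the standard conversion
# functions:
cat <<EOM >/tmp/epoch-convert.sql
select timestamp_millis(1512131696789) as from_millis,
       timestamp_micros(1512131696789012) as from_micros
EOM
bq query --use_legacy_sql=False < /tmp/epoch-convert.sql
# Both return 2017-12-01 12:34:56.789[012] UTC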
bq load --source_format=NEWLINE_DELIMITED_JSON \
--replace \
--max_bad_records=10 \
${DATASET}.${TABLE} ${DATA_FILE}
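# --max_bad_records=10 lets the job succeed while skipping the three rejected
# rows logged above; without it the whole load would fail.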
cat <<EOM >${SQL_FILE}
select name, dt, ts, unix_millis(ts) as ms,
unix_micros(ts) as micros
from $DATASET.$TABLE
EOM
bq query --use_legacy_sql=False < ${SQL_FILE}
# +-------------------+----------------------------+---------------------+---------------+------------------+
# |       name        |             dt             |         ts          |      ms       |      micros      |
# +-------------------+----------------------------+---------------------+---------------+------------------+
# | dt string - 3     | 2017-12-01T12:34:56.789012 | NULL                | NULL          | NULL             |
# | dt string - 1     | 2017-12-01T12:34:56.789012 | NULL                | NULL          | NULL             |
# | ts string seconds | NULL                       | 2017-12-01 12:34:56 | 1512131696000 | 1512131696000000 |
# | ts int seconds    | NULL                       | 2017-12-01 12:34:56 | 1512131696000 | 1512131696000000 |
# | ts float seconds  | NULL                       | 2017-12-01 12:34:56 | 1512131696789 | 1512131696789000 |
# | ts string + utc   | NULL                       | 2017-12-01 12:34:56 | 1512131696789 | 1512131696789012 |
# | ts float seconds  | NULL                       | 2017-12-01 12:34:56 | 1512131696789 | 1512131696789012 |
# | ts string + z     | NULL                       | 2017-12-01 12:34:56 | 1512131696789 | 1512131696789012 |
# | ts string - 1     | NULL                       | 2017-12-01 12:34:56 | 1512131696789 | 1512131696789012 |
# | ts string + tz    | NULL                       | 2017-12-01 06:34:56 | 1512110096789 | 1512110096789012 |
# +-------------------+----------------------------+---------------------+---------------+------------------+
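# Note the normalisation: the "+06:00" row was shifted to UTC (06:34:56),
# ints, floats and numeric strings were all accepted as epoch seconds, and
# only the offset-free dt rows survived. The ts column display drops the
# fractional seconds that unix_millis/unix_micros still show.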
bq extract --destination_format=NEWLINE_DELIMITED_JSON \
${DATASET}.${TABLE} gs://${BUCKET}/${TABLE}.json
gsutil cp gs://${BUCKET}/${TABLE}.json /tmp/
# {"name":"dt string - 3","dt":"2017-12-01T12:34:56.789012"}
# {"name":"dt string - 1","dt":"2017-12-01T12:34:56.789012"}
# {"name":"ts string seconds","ts":"2017-12-01 12:34:56 UTC"}
# {"name":"ts int seconds","ts":"2017-12-01 12:34:56 UTC"}
# {"name":"ts float seconds","ts":"2017-12-01 12:34:56.789 UTC"}
# {"name":"ts float seconds","ts":"2017-12-01 12:34:56.789012 UTC"}
# {"name":"ts string + utc","ts":"2017-12-01 12:34:56.789012 UTC"}
# {"name":"ts string + z","ts":"2017-12-01 12:34:56.789012 UTC"}
# {"name":"ts string - 1","ts":"2017-12-01 12:34:56.789012 UTC"}
# {"name":"ts string + tz","ts":"2017-12-01 06:34:56.789012 UTC"}
# Avro timestamp workaround:
# https://stackoverflow.com/questions/40651878/compatibility-of-avro-dates-and-times-with-bigquery
bq extract --destination_format=AVRO \
${DATASET}.${TABLE} gs://${BUCKET}/${TABLE}.avro
gsutil cp gs://${BUCKET}/${TABLE}.avro /tmp/
avro-tools getschema /tmp/${TABLE}.avro
# {
#   "type" : "record",
#   "name" : "Root",
#   "fields" : [ {
#     "name" : "name",
#     "type" : [ "null", "string" ]
#   }, {
#     "name" : "dt",
#     "type" : [ "null", "string" ]
#   }, {
#     "name" : "ts",
#     "type" : [ "null", "long" ]
#   } ]
# }
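# Note: ts is exported as a plain Avro long with no timestamp-micros
# logicalType (see the schema above), so nothing in the file records that the
# values are microseconds; this is what breaks the round trip below.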
avro-tools tojson /tmp/${TABLE}.avro
# {"name":{"string":"dt string - 3"},"dt":{"string":"2017-12-01T12:34:56.789012"},"ts":null}
# {"name":{"string":"dt string - 1"},"dt":{"string":"2017-12-01T12:34:56.789012"},"ts":null}
# {"name":{"string":"ts string seconds"},"dt":null,"ts":{"long":1512131696000000}}
# {"name":{"string":"ts int seconds"},"dt":null,"ts":{"long":1512131696000000}}
# {"name":{"string":"ts float seconds"},"dt":null,"ts":{"long":1512131696789000}}
# {"name":{"string":"ts float seconds"},"dt":null,"ts":{"long":1512131696789012}}
# {"name":{"string":"ts string + utc"},"dt":null,"ts":{"long":1512131696789012}}
# {"name":{"string":"ts string + z"},"dt":null,"ts":{"long":1512131696789012}}
# {"name":{"string":"ts string - 1"},"dt":null,"ts":{"long":1512131696789012}}
# {"name":{"string":"ts string + tz"},"dt":null,"ts":{"long":1512110096789012}}
echo "-------- Trying to Reload Exported AVRO to Big Query with Replace -------"
echo " --- Current Schema ---"
bq show ${DATASET}.${TABLE}
#    Last modified         Schema        Total Rows    Total Bytes   Expiration   Time Partitioning   Labels   kmsKeyName
#  ----------------- ------------------ ------------ ------------- ------------ ------------------- -------- ------------
#   06 Dec 21:16:06   |- name: string    0            0
#                     |- dt: datetime
#                     |- ts: timestamp
# Re-load the extracted AVRO
bq load --source_format=AVRO \
--replace \
--max_bad_records=10 \
${DATASET}.${TABLE} /tmp/${TABLE}.avro
echo " --- Current Schema ---"
bq show ${DATASET}.${TABLE}
# The schema has changed to (string, string, integer), inferred from the Avro file
#
#    Last modified        Schema        Total Rows    Total Bytes   Expiration   Time Partitioning   Labels   kmsKeyName
#  ----------------- ----------------- ------------ ------------- ------------ ------------------- -------- ------------
#   06 Dec 21:00:38   |- name: string   10           284
#                     |- dt: string
#                     |- ts: integer
echo "-------- Querying Replaced Data ---------"
bq query --use_legacy_sql=False < ${SQL_FILE}
# Error in query string: Error processing job '<JOB_ID>': No matching signature for
# function UNIX_MILLIS for argument types: INT64. Supported signature: UNIX_MILLIS(TIMESTAMP) at [1:22]
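# With ts now an INT64 of epoch micros, a corrected query converts it back
# before applying the TIMESTAMP functions (a sketch; the .fixed suffix is
# illustrative):
cat <<EOM >${SQL_FILE}.fixed
select name, dt, timestamp_micros(ts) as ts,
  unix_millis(timestamp_micros(ts)) as ms,
  unix_micros(timestamp_micros(ts)) as micros
from $DATASET.$TABLE
EOM
bq query --use_legacy_sql=False < ${SQL_FILE}.fixed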
echo "-------- Recreating Table ---------"
bq rm -f ${DATASET}.${TABLE}
bq mk -t ${DATASET}.${TABLE} ${SCHEMA_FILE}
echo "-------- Reloading AVRO Data WITHOUT replace ---------"
bq load --source_format=AVRO \
--max_bad_records=10 \
${DATASET}.${TABLE} /tmp/${TABLE}.avro
# BigQuery error in load operation: Error processing job 'hx-trial:bqjob_ra05543495ed27af_000001602d98d9fd_1': Provided Schema
# does not match Table hx-trial:chrisk.fun_with_time. Field dt has changed type from DATETIME to STRING.
echo "-------- Reloading AVRO Data WITH EXPLICIT SCHEMA ---------"
bq load --source_format=AVRO \
--max_bad_records=10 \
${DATASET}.${TABLE} \
/tmp/${TABLE}.avro \
${SCHEMA_FILE}
bq query --use_legacy_sql=False < ${SQL_FILE}
# Error in query string: Error processing job '<JOB_ID>': Cannot read tablet :
# Incompatible types. 'dt' : STRING 'dt' : INT64
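# A workable recovery path (a sketch; the _scratch and _restored names are
# illustrative): load the Avro into a fresh table, letting bq infer the
# (string, string, integer) schema, then materialise the converted columns
# back to the intended types. Left commented out as the demo ends here.
# bq load --source_format=AVRO ${DATASET}.${TABLE}_scratch /tmp/${TABLE}.avro
# bq query --use_legacy_sql=False \
#     --destination_table=${DATASET}.${TABLE}_restored \
#     "select name, cast(dt as datetime) as dt, timestamp_micros(ts) as ts
#      from ${DATASET}.${TABLE}_scratch"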