Skip to content

Instantly share code, notes, and snippets.

@Zatte
Last active January 13, 2022 20:34
Show Gist options
  • Save Zatte/2c84ede654ec20866801004ef85d0864 to your computer and use it in GitHub Desktop.
Save Zatte/2c84ede654ec20866801004ef85d0864 to your computer and use it in GitHub Desktop.
POC - BigQuery UDF generating a JSON Schema string that validates JSON to match a BQ schema,
CREATE TEMP FUNCTION BigQuerySchemaToJsonSchema(src STRING)
RETURNS STRING LANGUAGE js AS """
let expandStrRep = function(subStr){
let pairs = []
let position = 0
let carrotOffset = 0
let state = "col_name"
let pair = []
for(i=0; i<subStr.length; i++){
if (state == "col_name" && subStr[i] == " "){
pair.push(subStr.substr(position, i - position))
position = i +1
state = "col_schema"
}
if (state == "col_schema" && subStr[i] == "<") carrotOffset++
if (state == "col_schema" && subStr[i] == ">") carrotOffset--
if (state == "col_schema" && subStr[i] == "," && carrotOffset == 0){
pair.push(subStr.substr(position, i - position))
position = i +2
i++
pairs.push(pair)
pair = []
state = "col_name"
}
}
if (pair.length > 0){
pair.push(subStr.substr(position))
pairs.push(pair)
}
return pairs
};
let mapTypes = function(col_type){
col_type = col_type.trim()
if(col_type == "DATE")
return {
"type": "string",
"pattern": "^\\\\d\\\\d\\\\d\\\\d-\\\\d\\\\d-\\\\d\\\\d"
}
if(col_type == "DATETIME")
return {
"type": "string",
"pattern": "^\\\\d\\\\d\\\\d\\\\d-\\\\d\\\\d-\\\\d\\\\d(T| )\\\\d\\\\d:\\\\d\\\\d:\\\\d\\\\d.d?.+$"
}
if(col_type == "TIME")
return {
"type": "string",
"pattern": "^\\\\d\\\\d\\\\d\\\\d-\\\\d\\\\d-\\\\d\\\\d(T| )\\\\d\\\\d:\\\\d\\\\d:\\\\d\\\\d.d?.+$"
}
if(col_type == "STRING")
return {"type": "string"}
if(col_type == "NUMERIC")
return {"type": "number"}
if(col_type == "BIGNUMERIC")
return {"type": "number"}
if(col_type == "BOOL")
return {"type": "boolean"}
if(col_type == "FLOAT64")
return {"type": "number"}
// json doens't handle large ints; bigquery handles string numbers.
if(col_type == "INT64")
return {"type": {
"anyOf": [
{ "type": "string", "pattern": "^\\\\d+$"},
{ "type": "number"}
]}
}
if(col_type.indexOf("ARRAY") == 0 || col_type.indexOf("ARRAY") == 1)
return {
"type": "array",
"items": mapTypes(col_type.substr(6,col_type.length-7))
}
if(col_type.indexOf("STRUCT") == 0 || col_type.indexOf("STRUCT") == 1){
var res = {"type": "object", "properties": {}}
var subFields = expandStrRep(col_type.substr(7, col_type.length-8))
subFields.forEach(pair => {
res[pair[0]] = mapTypes(pair[1])
})
return res
}
// Not validated that this is the correct import format
if(col_type == "JSON")
return {"type": "string"}
// Must be base64 encoded.
if(col_type == "BYTES")
return {
"type": "string",
"description": "BigQuery bytes columns must be base64 encoded strings",
"pattern": "^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$"
}
if(col_type == "NUMERIC")
return {"type": "string"}
if(col_type == "GEOGRAPHY")
return {"type": "string"}
if(col_type == "INTERVAL")
return {"type": "string"}
// default falltback
return {"type": "string"}
};
let schema = {
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "BigQuery Schema",
"type": "object",
"properties": {
}
};
JSON.parse(src).forEach(column => {
schema.properties[column.column_name] = mapTypes(column.data_type)
})
return JSON.stringify(schema)
""";
SELECT
TABLE_NAME,
BigQuerySchemaToJsonSchema(TO_JSON_STRING(ARRAY_AGG((SELECT AS STRUCT column_name, data_type))))
FROM
`${DATASET}.INFORMATION_SCHEMA.COLUMNS`
WHERE
TABLE_NAME = "${TABLE_NAME}"
GROUP BY
TABLE_NAME
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment