Last active
January 13, 2022 20:34
-
-
Save Zatte/2c84ede654ec20866801004ef85d0864 to your computer and use it in GitHub Desktop.
POC - BigQuery UDF generating a JSON Schema string that validates JSON to match a BQ schema.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- POC: a BigQuery JS UDF that converts a table's column list (as produced by
-- INFORMATION_SCHEMA.COLUMNS) into a JSON Schema (draft 2020-12) string that
-- validates JSON documents against that table's schema.
--
-- Input : src — TO_JSON_STRING of [{column_name, data_type}, ...] rows.
-- Output: a JSON Schema document serialized as a STRING.
CREATE TEMP FUNCTION BigQuerySchemaToJsonSchema(src STRING)
RETURNS STRING LANGUAGE js AS """
  // Split a STRUCT body such as "a INT64, b ARRAY<STRING>" into
  // [column_name, column_type] pairs. A small state machine tracks
  // '<' / '>' nesting depth so commas inside nested ARRAY<...> or
  // STRUCT<...> types are not mistaken for field separators.
  let expandStrRep = function(subStr){
    let pairs = []
    let position = 0
    let angleDepth = 0
    let state = "col_name"
    let pair = []
    for(let i = 0; i < subStr.length; i++){  // `let i` — original leaked an implicit global
      if (state == "col_name" && subStr[i] == " "){
        pair.push(subStr.substr(position, i - position))
        position = i + 1
        state = "col_schema"
      }
      if (state == "col_schema" && subStr[i] == "<") angleDepth++
      if (state == "col_schema" && subStr[i] == ">") angleDepth--
      if (state == "col_schema" && subStr[i] == "," && angleDepth == 0){
        pair.push(subStr.substr(position, i - position))
        position = i + 2  // skip the ", " separator (comma + space)
        i++
        pairs.push(pair)
        pair = []
        state = "col_name"
      }
    }
    // Flush the trailing pair — the last field has no terminating comma.
    if (pair.length > 0){
      pair.push(subStr.substr(position))
      pairs.push(pair)
    }
    return pairs
  };

  // Map one BigQuery data type to a JSON Schema fragment, recursing into
  // ARRAY / STRUCT element types. Regex backslashes are quadrupled because
  // each pattern passes through two string literals (SQL, then JS) before
  // reaching the JSON Schema document.
  let mapTypes = function(col_type){
    col_type = col_type.trim()
    if(col_type == "DATE")
      return {
        "type": "string",
        "pattern": "^\\\\d\\\\d\\\\d\\\\d-\\\\d\\\\d-\\\\d\\\\d"
      }
    if(col_type == "DATETIME")
      return {
        "type": "string",
        // date, 'T' or space separator, time, optional fractional seconds.
        // (The original pattern ended ".d?.+$", which rejected a plain
        // "...HH:MM:SS" value and accepted arbitrary trailing text.)
        "pattern": "^\\\\d\\\\d\\\\d\\\\d-\\\\d\\\\d-\\\\d\\\\d(T| )\\\\d\\\\d:\\\\d\\\\d:\\\\d\\\\d(\\\\.\\\\d+)?$"
      }
    if(col_type == "TIME")
      return {
        "type": "string",
        // time-of-day only — the original reused the DATETIME pattern,
        // which wrongly required a leading date component.
        "pattern": "^\\\\d\\\\d:\\\\d\\\\d:\\\\d\\\\d(\\\\.\\\\d+)?$"
      }
    if(col_type == "STRING")
      return {"type": "string"}
    if(col_type == "NUMERIC" || col_type == "BIGNUMERIC")
      return {"type": "number"}
    if(col_type == "BOOL")
      return {"type": "boolean"}
    if(col_type == "FLOAT64")
      return {"type": "number"}
    // JSON numbers can't represent a full 64-bit integer; BigQuery also
    // accepts string-encoded integers, so allow either representation.
    // (anyOf is a top-level applicator keyword — the original nested it
    // inside "type", which is not valid JSON Schema.)
    if(col_type == "INT64")
      return {
        "anyOf": [
          { "type": "string", "pattern": "^\\\\d+$"},
          { "type": "number"}
        ]
      }
    if(col_type.indexOf("ARRAY<") == 0)
      return {
        "type": "array",
        // strip the leading "ARRAY<" (6 chars) and the trailing ">"
        "items": mapTypes(col_type.substr(6, col_type.length - 7))
      }
    if(col_type.indexOf("STRUCT<") == 0){
      var res = {"type": "object", "properties": {}}
      // strip the leading "STRUCT<" (7 chars) and the trailing ">"
      var subFields = expandStrRep(col_type.substr(7, col_type.length - 8))
      subFields.forEach(pair => {
        // Original bug: sub-fields were assigned onto `res` itself rather
        // than `res.properties`, producing an invalid object schema.
        res.properties[pair[0]] = mapTypes(pair[1])
      })
      return res
    }
    // Not validated that this is the correct import format.
    if(col_type == "JSON")
      return {"type": "string"}
    // Must be base64 encoded.
    if(col_type == "BYTES")
      return {
        "type": "string",
        "description": "BigQuery bytes columns must be base64 encoded strings",
        "pattern": "^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$"
      }
    if(col_type == "GEOGRAPHY")
      return {"type": "string"}
    if(col_type == "INTERVAL")
      return {"type": "string"}
    // default fallback for any type not handled above
    return {"type": "string"}
  };

  let schema = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "title": "BigQuery Schema",
    "type": "object",
    "properties": {}
  };
  // src is a JSON array of {column_name, data_type} objects; one schema
  // property is emitted per column.
  JSON.parse(src).forEach(column => {
    schema.properties[column.column_name] = mapTypes(column.data_type)
  })
  return JSON.stringify(schema)
""";
-- Emit one row per table: the table name plus the generated JSON Schema
-- string validating documents against that table's columns.
SELECT
  TABLE_NAME,
  BigQuerySchemaToJsonSchema(
    TO_JSON_STRING(ARRAY_AGG((SELECT AS STRUCT column_name, data_type)))
  ) AS json_schema  -- name the output column (it previously surfaced as f0_)
FROM
  `${DATASET}.INFORMATION_SCHEMA.COLUMNS`
WHERE
  TABLE_NAME = "${TABLE_NAME}"
GROUP BY
  TABLE_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment