@abicky · Last active September 9, 2019
Extract data from JSON using first-level keys
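
The Ruby script below benchmarks three ways to extract a first-level scalar from a JSON string column in BigQuery: the built-in json_extract_scalar wrapped in a case expression, a SQL UDF based on regexp_extract, and a JavaScript UDF. As a minimal sketch of the built-in function (the example value here is made up):

#standardSQL
-- json_extract_scalar returns the scalar at the given JSONPath as a string
select json_extract_scalar('{"latitude": 35.6}', '$.latitude')  -- => '35.6'

The benchmark script:
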
require 'csv'
require 'faker'
require 'json'
require 'tempfile'
require 'google/cloud/bigquery'
PROJECT_ID = ENV['PROJECT_ID']
DATASET_ID = ENV['DATASET_ID']
TABLE_ID = 'json_extract_benchmark'
bigquery = Google::Cloud::Bigquery.new(project_id: PROJECT_ID)
dataset = bigquery.dataset(DATASET_ID)
table = dataset.table(TABLE_ID) || dataset.create_table(TABLE_ID) do |t|
  t.schema do |s|
    s.string 'json', mode: :required
    s.string 'key', mode: :required
    s.string 'value_type', mode: :required
  end
end
PROPERTIES = [
  ['building_number', 'int'],
  ['latitude', 'float'],
  ['full_address', 'string'],
]
prng = Random.new(42)
Faker::Config.random = prng
# Generate 1,000,000 random rows and load them into the benchmark table,
# replacing any previous contents
Tempfile.open('json_extract_benchmark') do |f|
  csv = CSV.new(f)
  1_000_000.times do
    key, value_type = PROPERTIES.sample(random: prng)
    csv << [
      {
        building_number: Faker::Address.building_number.to_i,
        latitude: Faker::Address.latitude.to_f,
        full_address: Faker::Address.full_address,
      }.to_json,
      key,
      value_type,
    ]
  end
  f.rewind
  table.load(f, format: 'csv', write: 'truncate')
end
sqls = {
  json_extract_scalar: <<~'SQL' % TABLE_ID,
    #standardSQL
    select
      case key
        when 'building_number' then json_extract_scalar(json, '$.building_number')
        when 'latitude' then json_extract_scalar(json, '$.latitude')
        when 'full_address' then json_extract_scalar(json, '$.full_address')
      end
    from
      %s
  SQL
  f_json_extract_scalar: <<~'SQL' % TABLE_ID,
    #standardSQL
    create temporary function f_json_extract_scalar(json string, key string, value_type string)
    returns string as (
      case value_type
        when 'string' then
          json_extract_scalar(
            -- key should be escaped in advance
            regexp_extract(json, concat('"', key, r'"\s*:\s*("[^"\\]*(?:\\.[^"\\]*)*")'))
            , '$'
          )
        else
          -- key should be escaped in advance
          regexp_extract(json, concat('"', key, r'"\s*:\s*(-?\d+(?:\.\d+)?(?:[eE]-?\d+)?)'))
      end
    );
    select
      f_json_extract_scalar(json, key, value_type)
    from
      %s
  SQL
  f_json_extract_scalar_using_js: <<~'SQL' % TABLE_ID,
    #standardSQL
    create temporary function f_json_extract_scalar_using_js(json string, key string)
    returns string
    language js as """
      return JSON.parse(json)[key];
    """;
    select
      f_json_extract_scalar_using_js(json, key)
    from
      %s
  SQL
}
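
# Note: the f_json_extract_scalar query above splices `key` straight into the
# regexp pattern, hence the "key should be escaped in advance" comments. A
# minimal sketch of escaping on the Ruby side (a hypothetical extra step; the
# keys in this benchmark contain no regexp metacharacters, so it would be a
# no-op here):
#
#   escaped_key = Regexp.escape(key)
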
puts ' Name | Elapsed Time (s) | Total Slot Time (s) '
puts '------|------------------|---------------------'
# Run each query with the cache disabled and report wall-clock time and
# total slot time (totalSlotMs is in milliseconds)
sqls.each do |name, sql|
  job = dataset.query_job(sql, cache: false)
  job.wait_until_done!
  elapsed_time = job.ended_at - job.started_at
  total_slot_time = job.statistics['totalSlotMs'].to_i / 1000
  puts [name, elapsed_time, total_slot_time].join(' | ')
end
# Name | Elapsed Time (s) | Total Slot Time (s)
# ------|------------------|---------------------
# json_extract_scalar | 7.642 | 13
# f_json_extract_scalar | 88.171 | 156
# f_json_extract_scalar_using_js | 18.38 | 21
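
For reference, the benchmark can be run with something like
PROJECT_ID=your-project DATASET_ID=your_dataset ruby json_extract_benchmark.rb
(the file name is hypothetical), assuming the faker and google-cloud-bigquery
gems are installed and BigQuery credentials are configured; each run truncates
and reloads the table. The standalone query below then checks that the SQL UDF
and the JS UDF return the same values for a set of edge cases: negative
numbers, scientific notation, escaped quotes, whitespace around the colon, and
null.
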
#standardSQL
create temporary function f_json_extract_scalar(json string, key string, value_type string)
returns string as (
  case value_type
    when 'string' then
      json_extract_scalar(
        -- key should be escaped in advance
        regexp_extract(json, concat('"', key, r'"\s*:\s*("[^"\\]*(?:\\.[^"\\]*)*")'))
        , '$'
      )
    else
      -- key should be escaped in advance
      regexp_extract(json, concat('"', key, r'"\s*:\s*(-?\d+(?:\.\d+)?(?:[eE]-?\d+)?)'))
  end
);
create temporary function f_json_extract_scalar_using_js(json string, key string)
returns string
language js as """
  return JSON.parse(json)[key];
""";
with t as (
  select '{"abc":1}' as json, 'abc' as key, 'int' as value_type
  union all
  select '{"abc":-1}' as json, 'abc' as key, 'int' as value_type
  union all
  select '{"abc":1e10}' as json, 'abc' as key, 'float' as value_type
  union all
  select '{"abc":1.2e-1}' as json, 'abc' as key, 'float' as value_type
  union all
  select '{"abc":1E-10}' as json, 'abc' as key, 'float' as value_type
  union all
  select '{"abc":0.1234}' as json, 'abc' as key, 'float' as value_type
  union all
  select '{"abc":"string"}' as json, 'abc' as key, 'string' as value_type
  union all
  select r'{"abc":"s\"tring"}' as json, 'abc' as key, 'string' as value_type
  union all
  select '{"abc" \t: \n"string"}' as json, 'abc' as key, 'string' as value_type
  union all
  select r'{"abc":"s\tring"}' as json, 'abc' as key, 'string' as value_type
  union all
  select r'{"abc":null}' as json, 'abc' as key, 'string' as value_type
), extracted_values as (
  select
    json
    , case value_type
        when 'int' then cast(cast(f_json_extract_scalar(json, key, value_type) as int64) as string)
        when 'float' then cast(cast(f_json_extract_scalar(json, key, value_type) as float64) as string)
        when 'string' then f_json_extract_scalar(json, key, value_type)
      end as sql_udf_value
    , case value_type
        when 'int' then cast(cast(f_json_extract_scalar_using_js(json, key) as int64) as string)
        when 'float' then cast(cast(f_json_extract_scalar_using_js(json, key) as float64) as string)
        when 'string' then f_json_extract_scalar_using_js(json, key)
      end as js_udf_value
  from
    t
)
select
  json
  , sql_udf_value
  , js_udf_value
  , (sql_udf_value = js_udf_value) or (sql_udf_value is null and js_udf_value is null) as equal
from
  extracted_values
;
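-- Note: the round-trip casts above presumably normalize textual
-- representations before the string comparison: for {"abc":1e10} the
-- regexp-based UDF extracts the raw literal '1e10' while the JS UDF yields
-- '10000000000', and casting both through float64 and back to string makes
-- the check value-based rather than format-based.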
-- +--------------------+---------------+--------------+-------+
-- | json | sql_udf_value | js_udf_value | equal |
-- +--------------------+---------------+--------------+-------+
-- | {"abc":1} | 1 | 1 | true |
-- | {"abc":-1} | -1 | -1 | true |
-- | {"abc":1e10} | 10000000000 | 10000000000 | true |
-- | {"abc":1.2e-1} | 0.12 | 0.12 | true |
-- | {"abc":1E-10} | 1e-10 | 1e-10 | true |
-- | {"abc":0.1234} | 0.1234 | 0.1234 | true |
-- | {"abc":"string"} | string | string | true |
-- | {"abc":"s\"tring"} | s"tring | s"tring | true |
-- | {"abc" : | string | string | true |
-- | "string"} | | | |
-- | {"abc":"s\tring"} | s ring | s ring | true |
-- | {"abc":null} | NULL | NULL | true |
-- +--------------------+---------------+--------------+-------+
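
Every row compares equal, so the regexp-based SQL UDF reproduces the JS UDF's
results on these first-level cases; per the benchmark above, however, it is
also by far the slowest of the three approaches.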