BigQuery tricks

Data

with
dta as (
select '+++bHyZZlp6KRHJBj5Cw8wd48QA=' Id, timestamp('2019-11-07 06:53:39.632 UTC') Ts union all
select '++0M0+Aoih1hM5jTMgIsubaUNAA=' Id, timestamp('2019-11-01 18:33:53.461 UTC') Ts union all
select '++0M0+Aoih1hM5jTMgIsubaUNAA=' Id, timestamp('2019-11-01 18:33:53.461 UTC') Ts union all
select '++0M0+Aoih1hM5jTMgIsubaUNAA=' Id, timestamp('2019-11-01 18:33:53.461 UTC') Ts union all
select '++0M0+Aoih1hM5jTMgIsubaUNAA=' Id, timestamp('2019-11-01 19:02:06.289 UTC') Ts union all
select '++0tcflwgzx4TPitA9k2zxmSDwA=' Id, timestamp('2020-01-10 10:38:37.647 UTC') Ts union all
select '++18GGRJFjWrQXGjTano9OoZdQA=' Id, timestamp('2019-11-24 14:41:36.888 UTC') Ts union all
select '++18GGRJFjWrQXGjTano9OoZdQA=' Id, timestamp('2019-11-24 14:41:36.888 UTC') Ts union all
select '++18GGRJFjWrQXGjTano9OoZdQA=' Id, timestamp('2019-11-24 14:41:36.888 UTC') Ts union all
select '++18GGRJFjWrQXGjTano9OoZdQA=' Id, timestamp('2019-11-24 14:41:36.888 UTC') Ts union all
select '++18GGRJFjWrQXGjTano9OoZdQA=' Id, timestamp('2019-11-24 14:41:36.888 UTC') Ts union all
select '++18GGRJFjWrQXGjTano9OoZdQA=' Id, timestamp('2019-11-24 14:41:36.888 UTC') Ts union all
select '++18GGRJFjWrQXGjTano9OoZdQA=' Id, timestamp('2019-11-24 14:41:36.888 UTC') Ts union all
select '++3KXoM2AgZSeCq5hEwY+J8hOgA=' Id, timestamp('2019-12-01 12:25:28.552 UTC') Ts union all
select '++4J2DyOyQZzFw/BGfcT6YZIqQA=' Id, timestamp('2019-11-26 21:12:55.759 UTC') Ts union all
select '++6RDSX4S9vqs1US3jwFp3fVdwA=' Id, timestamp('2019-12-12 12:13:50.570 UTC') Ts union all
select '++6RDSX4S9vqs1US3jwFp3fVdwA=' Id, timestamp('2019-12-12 12:20:45.753 UTC') Ts union all
select '++6RDSX4S9vqs1US3jwFp3fVdwA=' Id, timestamp('2019-12-12 12:20:45.753 UTC') Ts union all
select '++6RDSX4S9vqs1US3jwFp3fVdwA=' Id, timestamp('2019-12-12 12:20:45.753 UTC') Ts union all
select '++6RDSX4S9vqs1US3jwFp3fVdwA=' Id, timestamp('2019-12-12 12:20:45.753 UTC') Ts union all
select '++6RDSX4S9vqs1US3jwFp3fVdwA=' Id, timestamp('2019-12-12 12:20:59.668 UTC') Ts union all
select '++6RDSX4S9vqs1US3jwFp3fVdwA=' Id, timestamp('2019-12-12 12:21:24.500 UTC') Ts union all
select '++6RDSX4S9vqs1US3jwFp3fVdwA=' Id, timestamp('2019-12-12 12:21:46.994 UTC') Ts union all
select '++7DBHuUXDUIWhrqhRaE1CsPWAA=' Id, timestamp('2019-11-09 16:30:59.642 UTC') Ts union all
select '++7HKV8uqk1LzCZA++zF312A0gA=' Id, timestamp('2019-11-06 19:23:31.053 UTC') Ts union all
select '++7HKV8uqk1LzCZA++zF312A0gA=' Id, timestamp('2019-11-06 19:23:31.053 UTC') Ts union all
select '++7bMX5DDoYHk1kUARymNbhVPgA=' Id, timestamp('2019-11-09 11:36:30.274 UTC') Ts union all
select '++7uAFsu2/Xx8I7jmViRReX1gQA=' Id, timestamp('2019-11-02 16:38:22.266 UTC') Ts union all
select '++7uAFsu2/Xx8I7jmViRReX1gQA=' Id, timestamp('2019-12-22 21:09:09.089 UTC') Ts union all
select '++A72C1LcnV0JCgcVHSmtEUVSAA=' Id, timestamp('2020-01-20 12:37:58.673 UTC') Ts union all
select '++BrCbtdWCFSGQ9+dI6/GPdeVQA=' Id, timestamp('2019-12-11 18:57:40.341 UTC') Ts union all
select '++CFmtzqz/Xx1lTiLtfXWtH7/gA=' Id, timestamp('2019-12-08 17:58:32.008 UTC') Ts union all
select '++CFmtzqz/Xx1lTiLtfXWtH7/gA=' Id, timestamp('2019-12-13 22:31:37.361 UTC') Ts union all
select '++CRRhHKj+Ei+PNGVxsSQCCp3wA=' Id, timestamp('2019-12-05 17:32:47.022 UTC') Ts union all
select '++CRRhHKj+Ei+PNGVxsSQCCp3wA=' Id, timestamp('2019-12-05 17:32:47.022 UTC') Ts union all
select '++CRRhHKj+Ei+PNGVxsSQCCp3wA=' Id, timestamp('2019-12-08 15:23:20.860 UTC') Ts union all
select '++FHPiYcIuOn5cneYVT0oUkdcAA=' Id, timestamp('2019-11-10 16:35:03.005 UTC') Ts union all
select '++FkfDioL4zTdlF36Y7G/BWxBgA=' Id, timestamp('2019-12-27 20:16:31.882 UTC') Ts union all
select '++FkfDioL4zTdlF36Y7G/BWxBgA=' Id, timestamp('2019-12-27 20:16:53.296 UTC') Ts union all
select '++FkfDioL4zTdlF36Y7G/BWxBgA=' Id, timestamp('2019-12-27 20:16:53.296 UTC') Ts union all
select '++FkfDioL4zTdlF36Y7G/BWxBgA=' Id, timestamp('2019-12-27 20:16:53.296 UTC') Ts union all
select '++FkfDioL4zTdlF36Y7G/BWxBgA=' Id, timestamp('2019-12-27 20:16:57.440 UTC') Ts union all
select '++FkfDioL4zTdlF36Y7G/BWxBgA=' Id, timestamp('2019-12-27 20:17:08.410 UTC') Ts union all
select '++FkfDioL4zTdlF36Y7G/BWxBgA=' Id, timestamp('2019-12-27 20:17:42.254 UTC') Ts union all
select '++FkfDioL4zTdlF36Y7G/BWxBgA=' Id, timestamp('2019-12-27 20:17:57.642 UTC') Ts union all
select '++GyePjsYwUNl+k+Ghpv1fHMzgA=' Id, timestamp('2019-11-10 16:03:49.830 UTC') Ts union all
select '++GyePjsYwUNl+k+Ghpv1fHMzgA=' Id, timestamp('2019-11-10 16:14:31.060 UTC') Ts union all
select '++GyePjsYwUNl+k+Ghpv1fHMzgA=' Id, timestamp('2019-11-10 16:14:31.060 UTC') Ts union all
select '++GyePjsYwUNl+k+Ghpv1fHMzgA=' Id, timestamp('2019-11-10 16:29:05.827 UTC') Ts union all
select '++JjLrnuMJbwCTvmmK51LU+7YwA=' Id, timestamp('2019-11-21 14:33:53.537 UTC') Ts union all
select '++LNN2e6vPYgOdH4pB0OCgnrngA=' Id, timestamp('2020-01-01 22:46:51.868 UTC') Ts union all
select '++Rqo0aHCJ3Lj7Pz/bOJa4zRKwA=' Id, timestamp('2019-11-18 22:42:23.121 UTC') Ts union all
select '++TCog2pOVSPHvz8zn21HcOrGAA=' Id, timestamp('2019-12-07 14:19:15.445 UTC') Ts union all
select '++UGyCMXY4cX1ukZYH6C8Te4OQA=' Id, timestamp('2019-12-29 19:18:13.065 UTC') Ts union all
select '++Ue3GMumJlXpn8gqa44WV45bAA=' Id, timestamp('2019-12-07 16:20:51.104 UTC') Ts union all
select '++Ue3GMumJlXpn8gqa44WV45bAA=' Id, timestamp('2020-01-25 15:57:36.650 UTC') Ts union all
select '++Ue3GMumJlXpn8gqa44WV45bAA=' Id, timestamp('2020-02-01 14:37:37.914 UTC') Ts union all
select '++V8mwPHJeKcKmlE856pvzlE8wA=' Id, timestamp('2019-11-27 18:10:59.150 UTC') Ts union all
select '++XkCBhXnihsdaLhpKarD76H/AA=' Id, timestamp('2019-11-04 20:53:56.066 UTC') Ts union all
select '++Xx+lgS50VGkctI/q0tX1rw7QA=' Id, timestamp('2019-11-23 16:57:27.936 UTC') Ts union all
select '++Xx+lgS50VGkctI/q0tX1rw7QA=' Id, timestamp('2020-01-18 13:13:13.510 UTC') Ts union all
select '++YyP1/4iwyL9KLF0a7tzefsGwA=' Id, timestamp('2019-12-10 13:24:28.414 UTC') Ts union all
select '++ZB7vnEJdm8WqURjg+CeO397QA=' Id, timestamp('2019-11-29 15:49:53.709 UTC') Ts union all
select '++ZB7vnEJdm8WqURjg+CeO397QA=' Id, timestamp('2019-11-29 15:49:53.709 UTC') Ts union all
select '++ZB7vnEJdm8WqURjg+CeO397QA=' Id, timestamp('2019-11-29 15:49:53.709 UTC') Ts union all
select '++ZoCUPqez7g5PKU/MhVddbF5AA=' Id, timestamp('2019-12-16 10:55:04.973 UTC') Ts union all
select '++ZoCUPqez7g5PKU/MhVddbF5AA=' Id, timestamp('2019-12-16 10:55:04.973 UTC') Ts union all
select '++ZoCUPqez7g5PKU/MhVddbF5AA=' Id, timestamp('2019-12-16 10:55:04.973 UTC') Ts
)
select count(1) RowCount, count(distinct concat(Id, cast(Ts as string))) UniqCount, count(distinct Id) UniqIdCount
from dta

Deduplication of data

De-dupe per Google's advice

select * except (rn) from (
select *,
row_number() over (partition by Id, Ts order by Ts) rn
from dta
) where rn=1
order by Id, Ts

De-dupe by ARRAY_AGG

select data.* from (
select array_agg(dta order by Ts limit 1)[offset(0)] data
from dta
group by Id, Ts
) order by Id, Ts
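The ARRAY_AGG variant can be cheaper on large tables: the LIMIT 1 inside the aggregation lets BigQuery drop all but one row per group early, instead of materializing whole partitions for ROW_NUMBER.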

De-dupe using complex fields like structures and arrays

Let's assume you need to de-duplicate using compound fields like structures or arrays. The approach is similar to the one above; however, we'll use the following trick:

ROW_NUMBER() OVER (PARTITION BY SHA512(TO_JSON_STRING((field1, field2.subfield5, field3, fieldN))) ORDER BY field6 desc)

it is the same as

ROW_NUMBER() OVER (PARTITION BY SHA512(TO_JSON_STRING(struct(field1, field2.subfield5, field3, fieldN))) ORDER BY field6 desc)
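Put together, a de-dupe over compound fields might look like this (a minimal sketch; the table src and its fields are hypothetical placeholders):

select * except (rn) from (
select *,
row_number() over (partition by sha512(to_json_string((field1, field2, field3))) order by field6 desc) rn
from src
) where rn = 1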

IN operator using a set of fields

select * from A where (A.f1, A.f3, A.fN) IN (
  select (q1, q2, q3) from B
)
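The same pattern verified on inline data (hypothetical values):

with A as (
select 1 f1, 2 f2, 3 f3, 4 fN union all
select 5, 6, 7, 8
),
B as (
select 1 q1, 3 q2, 4 q3
)
select * from A where (A.f1, A.f3, A.fN) IN (
  select (q1, q2, q3) from B
)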

LEAD analytical function

LEAD to make SCD type 2

-- dta is the sample data CTE from the Data section above
with dev as (
select Id, Ts
from dta
group by 1, 2
)
select Id, 
  Ts as EffectiveFrom,
  coalesce(timestamp_sub(lead(Ts) over (partition by Id order by Ts), interval 1 SECOND), TIMESTAMP ('9999-12-31 23:59:59.999')) as EffectiveTo
from dev
order by 1, 2

ARRAY_AGG to make SCD type 2

with dev as (
select Id, array_agg(distinct Ts ORDER BY Ts) actTs
from dta
group by Id
)
select Id,
  actTs[safe_offset(pos)] EffectiveFrom,
  ifnull(timestamp_sub(actTs[safe_offset(pos+1)], interval 1 second), TIMESTAMP ('9999-12-31 23:59:59.999')) EffectiveTo
from dev
join unnest(dev.actTs) lines with offset pos
order by 1, 2

SQL generator

with query as (
select 0 pos, ["with dta as ("] line
union all
select rwnm, if(rwnm <> rwcnt, array_concat(txt, ["union all"]), array_concat(txt, [")"])) from (
  select [
      "select '" || project_id || "' p, '" || dataset_id || "' d, '" || table_id || "' t, date(DateField) dt",
      "from `"   || project_id || "."      || dataset_id || "."      || table_id || "`",
      "where true",
      "group by date(DateField)"
    ] txt,
    row_number() over (order by table_id) rwnm,
    count(1) over () rwcnt
  from `PROJECT_NAME.DATASET_NAME.__TABLES_SUMMARY__`)
union all
select 999999, [
  "select t, dt, pdt, date_diff(dt, pdt, DAY) dff from (",
  "select t, dt, lag(dt) over (partition by t order by dt) pdt",
  "FROM dta",
  "group by t, dt",
  ") where date_diff(dt, pdt, DAY) > 1"
])
select `with` from query join unnest(line) as `with` order by pos
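Each row of the output is one line of a ready-to-run query: it scans every table listed in __TABLES_SUMMARY__ and reports gaps of more than one day in DateField. Copy the result and run it as a second step, or feed it to EXECUTE IMMEDIATE.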

Projects comparison

SELECT
  o.project_id OldProject,
  o.table_id OldTable,
  format("%'d", o.row_count) OldRowCount,
  concat(format("%'d", CAST((n.row_count-o.row_count)/
  IF(o.row_count=0, 1,
     o.row_count)*100 AS int64)), '%') DiffPercent,
  format("%'d", n.row_count) NewRowCount,
  n.table_id NewTable,
  n.project_id NewProject
FROM
  `OLD-PROJECT-ID.DATASET_ID.__TABLES__` o
JOIN
  `NEW-PROJECT-ID.DATASET_ID.__TABLES__` n
ON
  o.table_id = n.table_id

External table file names

The _FILE_NAME pseudo-column exposes the source file path for every row of an external table.

SELECT ActivityDate, ARRAY_AGG(FileName) FileList, count(FileName) FileCount
FROM (
 SELECT DATE(TIMESTAMP(activitytimestamp)) ActivityDate,
   SUBSTR(SPLIT(_FILE_NAME, '/')[OFFSET(5)], 1, 15) FileName
 FROM `PROJECT-ID.DATASET_ID.EXTERNAL_TABLE_ID`
 GROUP BY 1, 2 ORDER BY 1, 2
) GROUP BY 1
having array_length(FileList) > 1
ORDER BY 3 desc, 1 DESC

Human-readable numbers

create temp function human_readable_count(count numeric) as (
case
when count/pow(10, 15) >= 1 then format('%.0f', round(count/pow(10, 15))) || 'Q+'
when count/pow(10, 12) >= 1 then format('%.0f', round(count/pow(10, 12))) || 'T+'
when count/pow(10, 9)  >= 1 then format('%.0f', round(count/pow(10, 9)))  || 'B+'
when count/pow(10, 6)  >= 1 then format('%.0f', round(count/pow(10, 6)))  || 'M+'
when count/pow(10, 3)  >= 1 then format('%.0f', round(count/pow(10, 3)))  || 'K+'
else cast(count as string)
end);

create temp function human_readable_size(size numeric) as (
case
when size/pow(2, 50) >= 1 then format('%.0f', round(size/pow(2, 50))) || 'PB+'
when size/pow(2, 40) >= 1 then format('%.0f', round(size/pow(2, 40))) || 'TB+'
when size/pow(2, 30) >= 1 then format('%.0f', round(size/pow(2, 30))) || 'GB+'
when size/pow(2, 20) >= 1 then format('%.0f', round(size/pow(2, 20))) || 'MB+'
when size/pow(2, 10) >= 1 then format('%.0f', round(size/pow(2, 10))) || 'KB+'
else cast(size as string)
end);

select ifnull(data_set, ' Total:') DatasetName, human_readable_count(sum(row_cnt)) RowCount, human_readable_size(sum(data_amnt)) SizeOnDisk from (
select 'wikipedia' data_set, sum(row_count) row_cnt, sum(size_bytes) data_amnt from `bigquery-public-data.wikipedia.__TABLES__`
where table_id not like 'wiki%'
union all
select 'stackoverflow' data_set, sum(row_count), sum(size_bytes) from `bigquery-public-data.stackoverflow.__TABLES__`
where table_id not like 'post%' and table_id <> 'stackoverflow_posts'
) group by rollup (data_set) order by data_set

EXECUTE IMMEDIATE

Querying data using a table name as a parameter

EXECUTE IMMEDIATE format("""
select count(1) from `%s` where datehour is not null
""", 'bigquery-public-data.wikipedia.pageviews_2019')

Row counts per ingestion month from all dataset tables

Long script

begin
  declare i default 0;
  declare query default '';
  declare tables array<struct<project_id string, dataset_id string, table_id string>>;

  set tables = (
    select array_agg(struct(project_id, dataset_id, table_id))
    from `bigquery-public-data.wikipedia.__TABLES__`
    where regexp_contains(table_id, r'pageviews.(201).')
  );
  
  while i < array_length(tables) do
    set query = query||'select "'||tables[offset(i)].project_id||'.'||tables[offset(i)].dataset_id||'.'||tables[offset(i)].table_id||
      '" TableName, timestamp_trunc(datehour, month) InsertMonth, count(1) RowCount from `'||
      tables[offset(i)].project_id||'.'||tables[offset(i)].dataset_id||'.'||tables[offset(i)].table_id||'`'||
      ' where datehour between timestamp("2016-01-01") and timestamp("2018-12-01") group by 2';
    if i < array_length(tables) - 1 then
      set query = query || ' union all ';
    end if;
    set i = i + 1;
  end while;
  
  set query = "select * from ("||query||") order by 1, 2 desc";

  execute immediate query;
end

Short script

execute immediate "select * from ("||
((select string_agg("""
select '"""||project_id||"."||dataset_id||"."||table_id||"""' TableName,
timestamp_trunc(datehour, month) InsertMonth, count(1) RowCount
from `"""||project_id||"."||dataset_id||"."||table_id||"""`
where datehour between timestamp('2016-01-01') and timestamp('2018-12-01')
group by 2""", """
union all""") from `bigquery-public-data.wikipedia.__TABLES__`
where type = 1 and regexp_contains(table_id, r'pageviews.(201).')))||"""
) order by 1, 2 desc"""

BigQuery audit

SQL part

select project_id, user_email, job_type, statement_type,
min(creation_time) begin_period, max(creation_time) end_period,
sum(total_slot_ms)/(1000*60*60*24) total_slot_hour,
sum(total_bytes_processed)/(1024*1024*1024*1024) total_processed_tb,
timestamp_diff(max(end_time), min(start_time), hour) total_duration_hour
from `region-eu.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
where creation_time between timestamp(date_sub(current_date, interval 30 day)) and timestamp(current_date())
and job_type = 'QUERY'
and statement_type is not null
group by project_id, user_email, job_type,
statement_type
order by total_processed_tb desc

How to know who created a table (Logs Explorer)

resource.type = ("bigquery_project" OR "bigquery_dataset")
logName="projects/zoral-sandbox/logs/cloudaudit.googleapis.com%2Factivity"
-- protoPayload.methodName = "google.cloud.bigquery.v2.TableService.InsertTable"
protoPayload.resourceName = "projects/YOUR-PROJECT/datasets/YOUR_DATASET/tables/YOUR_TABLE"
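Uncomment the methodName line to narrow the results to table creation events only.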

Array creation

select 'molly' name, array(
  select struct('tiny' as name, 'snake' as kind, 7 as age) union all
  select struct('peebles' as name, 'frog' as kind, 3 as age) union all
  select struct('gorgeous' as name, 'peacock' as kind, 1 as age) union all
  select struct('fat' as name, 'rat' as kind, 2 as age) union all
  select struct('poor' as name, 'eagle' as kind, 17 as age)
  ) pets, 9 views
union all
select 'jack' name, [
  struct('shining' as name, 'snake' as kind, 11 as age),
  ('dark', 'lizard', 15),
  ('rainbow', 'pony', 21),
  ('lucky', 'dog', 5),
  ('happy', 'octopus', 87)
  ] pets, 27 views

Transactions

There are no transactions on the BigQuery side. However, there are atomic statements.

Emulation of a transaction-like upsert (delete-insert) within a single atomic statement

declare partitions default (select array_agg(distinct partition_fld) from target_table);

merge into target_table t
using (select * from source_table cross join unnest([true, false]) to_be_inserted) s
on t.partition_fld in unnest(partitions)
  and t.id = s.id and not to_be_inserted
when matched then
  delete
when not matched and to_be_inserted then
  insert row
;
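The cross join with unnest([true, false]) doubles every source row: the copy with to_be_inserted = false matches existing target rows and deletes them, while the to_be_inserted = true copy never satisfies the ON clause and gets inserted, so the delete and the insert land in one atomic MERGE.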

Get active partition counts

execute immediate (
select string_agg("""
select table_schema, table_name, count(1) active_partition_count
from `"""||catalog_name||'.'||schema_name||""".INFORMATION_SCHEMA.PARTITIONS`
where storage_tier = 'ACTIVE' group by 1, 2""", ' union all') || """
order by 3 desc, 1, 2"""
from `region-us.INFORMATION_SCHEMA.SCHEMATA`
)

Check whether you need to look into streaming table data instead of partition metadata

select
  table_name,
  cnt, array_length(arr),
  cnt > ifnull(array_length(arr), 0) need_to_scan_real_data
from (
  select
    table_name,
    count(1) cnt, array_agg(parse_date('%Y%m%d', if(regexp_contains(partition_id, r'^\d{8}$'), partition_id, null)) ignore nulls) arr
  from `YOUR_DATASET.INFORMATION_SCHEMA.PARTITIONS`
  where true
    and storage_tier = 'ACTIVE'
    and last_modified_time > timestamp(current_date - 3)
  group by table_name
)
where cnt > ifnull(array_length(arr), 0)
order by 4 desc, 1

Random fact table generator

create or replace table `YOUR_TABLE` (
    event_id string,
    event_ts timestamp
);

for i in (select * from unnest(generate_array(1, 100))) do
    insert into `YOUR_TABLE`
        with dta as (
        select
            generate_uuid() event_id,
            generate_timestamp_array(
                timestamp(current_date - cast(rand()*100 as int64)),
                current_timestamp, interval 30+cast(rand()*60 as int64) second
            ) event_ts_arr
        )
        select event_id, event_ts
        from dta join unnest(event_ts_arr) event_ts;
end for;

Delete all region-based datasets from a project (cascade)

for ds in (select schema_name val from `region-us.INFORMATION_SCHEMA.SCHEMATA`) do
    begin
        execute immediate "drop schema "||ds.val||" cascade";
    exception when error then
    end;
end for;

Delete all empty datasets from a project

declare fire_it default false;
for schema in (
  select schema_name name
  from `region-eu.INFORMATION_SCHEMA.SCHEMATA`
) do
  execute immediate '''select
    (select count(1) from `'''||schema.name||'''.__TABLES__`) = 0
  ''' into fire_it;
  if fire_it then
    execute immediate "drop schema if exists "||schema.name;
  end if;
end for;