Skip to content

Instantly share code, notes, and snippets.

@doit-mattporter
Last active July 14, 2023 23:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save doit-mattporter/09dd2777228529eff1c2d672cf892555 to your computer and use it in GitHub Desktop.
Save doit-mattporter/09dd2777228529eff1c2d672cf892555 to your computer and use it in GitHub Desktop.
This Python script uses the Google Cloud BigQuery API to query a specified BigQuery dataset's job history, returning information about the total number of queries made to each table, the users who made those queries, the number of queries each user made, their last query, and the timestamp of their last query, sorted by the number of total queri…
#!/usr/bin/env python3
from google.cloud import bigquery
import pandas as pd
# Specify your dataset
project_id = "your_project_here"
dataset_id = "your_dataset_here"
# Specify how far back you want to look through your job history, up to 180 days
job_history_lookback_days = 2
# Construct a query to get job information from the jobs metadata
query = f"""
WITH table_info AS (
SELECT
CONCAT(t.project_id, ".", t.dataset_id, ".", t.table_id) AS referenced_table,
COUNT(job_id) as number_of_hits
FROM `{project_id}.region-us.INFORMATION_SCHEMA.JOBS`,
UNNEST(referenced_tables) as t
WHERE STATE = 'DONE'
AND creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {job_history_lookback_days} DAY)
AND t.dataset_id = '{dataset_id}'
GROUP BY referenced_table
),
user_table_info AS (
SELECT
CONCAT(t.project_id, ".", t.dataset_id, ".", t.table_id) as referenced_table,
user_email,
COUNT(job_id) as user_hits,
ARRAY_AGG(STRUCT(query, creation_time) ORDER BY creation_time DESC LIMIT 1)[OFFSET(0)].query as last_query,
MAX(creation_time) as user_last_hit
FROM `{project_id}.region-us.INFORMATION_SCHEMA.JOBS`,
UNNEST(referenced_tables) as t
WHERE STATE = 'DONE'
AND creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {job_history_lookback_days} DAY)
AND t.dataset_id = '{dataset_id}'
GROUP BY referenced_table, user_email
)
SELECT
table_info.referenced_table,
table_info.number_of_hits,
user_table_info.user_email,
user_table_info.user_hits,
user_table_info.last_query,
user_table_info.user_last_hit
FROM table_info
JOIN user_table_info
ON table_info.referenced_table = user_table_info.referenced_table
ORDER BY table_info.number_of_hits DESC, table_info.referenced_table ASC, user_table_info.user_hits DESC
"""
client = bigquery.Client()
table_info_df = (
client.query(query)
.result()
.to_dataframe(create_bqstorage_client=False)
)
print(table_info_df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment