Last active
July 14, 2023 23:41
-
-
Save doit-mattporter/09dd2777228529eff1c2d672cf892555 to your computer and use it in GitHub Desktop.
This Python script uses the Google Cloud BigQuery API to query a specified BigQuery dataset's job history, returning information about the total number of queries made to each table, the users who made those queries, the number of queries each user made, their last query, and the timestamp of their last query, sorted by the number of total queri…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from google.cloud import bigquery | |
import pandas as pd | |
# Specify your dataset | |
project_id = "your_project_here" | |
dataset_id = "your_dataset_here" | |
# Specify how far back you want to look through your job history, up to 180 days | |
job_history_lookback_days = 2 | |
# Construct a query to get job information from the jobs metadata | |
query = f""" | |
WITH table_info AS ( | |
SELECT | |
CONCAT(t.project_id, ".", t.dataset_id, ".", t.table_id) AS referenced_table, | |
COUNT(job_id) as number_of_hits | |
FROM `{project_id}.region-us.INFORMATION_SCHEMA.JOBS`, | |
UNNEST(referenced_tables) as t | |
WHERE STATE = 'DONE' | |
AND creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {job_history_lookback_days} DAY) | |
AND t.dataset_id = '{dataset_id}' | |
GROUP BY referenced_table | |
), | |
user_table_info AS ( | |
SELECT | |
CONCAT(t.project_id, ".", t.dataset_id, ".", t.table_id) as referenced_table, | |
user_email, | |
COUNT(job_id) as user_hits, | |
ARRAY_AGG(STRUCT(query, creation_time) ORDER BY creation_time DESC LIMIT 1)[OFFSET(0)].query as last_query, | |
MAX(creation_time) as user_last_hit | |
FROM `{project_id}.region-us.INFORMATION_SCHEMA.JOBS`, | |
UNNEST(referenced_tables) as t | |
WHERE STATE = 'DONE' | |
AND creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {job_history_lookback_days} DAY) | |
AND t.dataset_id = '{dataset_id}' | |
GROUP BY referenced_table, user_email | |
) | |
SELECT | |
table_info.referenced_table, | |
table_info.number_of_hits, | |
user_table_info.user_email, | |
user_table_info.user_hits, | |
user_table_info.last_query, | |
user_table_info.user_last_hit | |
FROM table_info | |
JOIN user_table_info | |
ON table_info.referenced_table = user_table_info.referenced_table | |
ORDER BY table_info.number_of_hits DESC, table_info.referenced_table ASC, user_table_info.user_hits DESC | |
""" | |
client = bigquery.Client() | |
table_info_df = ( | |
client.query(query) | |
.result() | |
.to_dataframe(create_bqstorage_client=False) | |
) | |
print(table_info_df) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment