Skip to content

Instantly share code, notes, and snippets.

@geminixiang
Last active July 10, 2023 01:41
Show Gist options
  • Save geminixiang/93042b06131e416a6299dee03e060b60 to your computer and use it in GitHub Desktop.
Save geminixiang/93042b06131e416a6299dee03e060b60 to your computer and use it in GitHub Desktop.
Top N table size of BigQuery
# ref. https://stackoverflow.com/a/59633371
# Before using
# pip install oauth2client google-api-python-client google-cloud-bigquery
# export GOOGLE_APPLICATION_CREDENTIALS=<CREDENTIALS>.json
from google.cloud import bigquery
from google.cloud.bigquery import Dataset
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
# Leverage the Application Default Credentials for authentication
credentials = GoogleCredentials.get_application_default()
service = discovery.build("cloudresourcemanager", "v1", credentials=credentials)
# List projects
request = service.projects().list()
response = request.execute()
# Main loop to list projects
for project in response.get("projects", []):
try:
client = bigquery.Client(
project["projectId"]
) # Start the client in the right project
# Loop to list datasets
datasets = list(client.list_datasets())
if datasets: # If there is some BQ dataset
print("Datasets in project {}:".format(project["projectId"]))
# Loop to list the tables in each dataset
max_tables = []
for dataset in datasets:
print(" - {}".format(dataset.dataset_id))
get_sizeGiB = client.query(
"select table_id, (size_bytes /1073741824) as sizeGiB from "
+ dataset.dataset_id
+ ".__TABLES__"
) # This query retrieves all the tables in the dataset and the size in GiB. It can be modified to pull more fields.
tables = get_sizeGiB.result() # Get the result
# Loop to list the tables and print the size
for table in tables:
print("\t{}: {} GiB".format(table.table_id, table.sizeGiB))
max_tables.append(
(dataset.dataset_id, table.table_id, table.sizeGiB)
)
if max_tables:
max_tables.sort(
key=lambda x: x[2], reverse=True
) # Sort the max_tables list based on table size in descending order
print("Max table sizes in project {}:".format(project["projectId"]))
for i in range(
min(10, len(max_tables))
): # Print the top 5 table sizes or less if there are fewer than 5
dataset_id, table_id, table_size = max_tables[i]
print("\t{}.{}: {} GiB".format(dataset_id, table_id, table_size))
else:
print("{} project does not contain any datasets.".format(projectId))
except Exception:
pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment