@doit-mattporter
Last active July 14, 2023 23:37
Uses the Google Cloud BigQuery API to retrieve and display metadata for every table in a specified BigQuery dataset, including table size, row count, creation time, last-modified time, partitioning and clustering details, and an estimate of monthly storage cost.
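Prerequisites: the script assumes the google-cloud-bigquery and pandas packages are installed (recent versions of the BigQuery client also need db-dtypes for to_dataframe()) and that application-default credentials are configured. The 0.020 multiplier is the active logical storage rate of roughly $0.02 per GiB per month in the US multi-region; adjust it for your region and for long-term storage pricing. A minimal sketch of the arithmetic, with an illustrative table size:

# Illustrative only: a 512 GiB table at the assumed $0.02/GiB-month rate
size_bytes = 512 * 1024**3
monthly_cost = size_bytes / 1024**3 * 0.020  # = $10.24/month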
#!/usr/bin/env python3
from google.cloud import bigquery
import pandas as pd

# Create a BigQuery client using application-default credentials
client = bigquery.Client()

# Specify your project and dataset
project_id = "your_project_here"
dataset_id = "your_dataset_here"
sql = f"""
SELECT
    '{project_id}' AS project,
    '{dataset_id}' AS database,
    t.table_id AS tablename,
    i.table_type,
    FORMAT_TIMESTAMP('%Y-%m-%d %H:%M:%S UTC', TIMESTAMP_SECONDS(CAST(t.creation_time/1000 AS INT64))) AS creation_time,
    FORMAT_TIMESTAMP('%Y-%m-%d %H:%M:%S UTC', TIMESTAMP_SECONDS(CAST(t.last_modified_time/1000 AS INT64))) AS last_modified_time,
    t.size_bytes,
    t.size_bytes/(1024*1024*1024) AS size_gbs,
    -- Assumes the $0.02/GiB-month active logical storage rate; adjust for your region
    t.size_bytes/(1024*1024*1024)*0.020 AS monthly_cost,
    t.row_count
FROM
    `{project_id}.{dataset_id}.__TABLES__` t
LEFT JOIN
    `{project_id}.{dataset_id}.INFORMATION_SCHEMA.TABLES` i
ON
    t.table_id = i.table_name
"""
# Run the query and load the results into a DataFrame
df = client.query(sql).to_dataframe()
# Cache of per-table metadata fetched from the BigQuery API
table_metadata = {}


# Fetch partitioning, clustering, and expiration details for one table
def get_table_metadata(row):
    table = row['tablename']
    if table not in table_metadata:
        project = row['project']
        database = row['database']
        # Retrieve the full table resource from the BigQuery API
        table_obj = client.get_table(f"{project}.{database}.{table}")
        if table_obj.time_partitioning:
            partitioning = 'Partitioned'
            # Ingestion-time partitioned tables have no explicit partitioning field
            partitioned_by = table_obj.time_partitioning.field or '_PARTITIONTIME'
        else:
            partitioning = 'Not Partitioned'
            partitioned_by = None
        clustered_by = ", ".join(table_obj.clustering_fields) if table_obj.clustering_fields else None
        table_metadata[table] = {
            'partitioning': partitioning,
            'partitioned_by': partitioned_by,
            'clustered_by': clustered_by,
            'expiration_date': table_obj.expires,
        }
    return table_metadata[table]
# Apply the function to each row and expand the metadata dict into columns
df['metadata'] = df.apply(get_table_metadata, axis=1)
df = pd.concat([df, df['metadata'].apply(pd.Series)], axis=1)
df.drop(columns=['metadata'], inplace=True)

# Sort by cost while the column is still numeric, then format for display
df = df.sort_values(by='monthly_cost', ascending=False)
df['monthly_cost'] = df['monthly_cost'].map('${:,.2f}'.format)
df['size_gbs'] = df['size_gbs'].map('{:,.2f} GBs'.format)
df['row_count'] = df['row_count'].map('{:,}'.format)
print(df)
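
To keep the report around, a minimal follow-up (the file name here is illustrative, not part of the original script) is to write the DataFrame to CSV:

# Persist the formatted report to an illustrative file name
df.to_csv("bigquery_table_metadata.csv", index=False)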