Last active
July 14, 2023 23:37
-
-
Save doit-mattporter/3aff6df56346a8259b9e88babbe3733d to your computer and use it in GitHub Desktop.
Utilizes the Google Cloud BigQuery API to retrieve and display metadata for all tables in a specified BigQuery dataset, including table size, row count, creation time, last-modified time, and partitioning and clustering information, as well as an estimate of the monthly storage cost.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
from google.cloud import bigquery
import pandas as pd
# Uses Application Default Credentials; the billing project comes from the
# environment unless one is passed explicitly to Client().
client = bigquery.Client()
# Specify your dataset
# NOTE(review): placeholders — replace with a real project and dataset id
# before running.
project_id = "your_project_here"
dataset_id = "your_dataset_here"
# Pull per-table stats from the legacy __TABLES__ metadata view, joined with
# INFORMATION_SCHEMA.TABLES for the table type. size_gbs is derived from
# size_bytes, and monthly_cost estimates storage at $0.020 per GB per month.
sql = f"""
SELECT
'{project_id}' as project,
'{dataset_id}' as database,
t.table_id as tablename,
i.table_type,
FORMAT_TIMESTAMP('%Y-%m-%d %H:%M:%S UTC', TIMESTAMP_SECONDS(CAST(t.creation_time/1000 AS INT64))) as creation_time,
FORMAT_TIMESTAMP('%Y-%m-%d %H:%M:%S UTC', TIMESTAMP_SECONDS(CAST(t.last_modified_time/1000 AS INT64))) as last_modified_time,
t.size_bytes,
t.size_bytes/(1024*1024*1024) as size_gbs,
t.size_bytes/(1024*1024*1024)*0.020 as monthly_cost,
t.row_count
FROM
`{project_id}.{dataset_id}.__TABLES__` t
LEFT JOIN
`{project_id}.{dataset_id}.INFORMATION_SCHEMA.TABLES` i
ON
t.table_id = i.table_name
"""
# Run the query and materialize the result as a pandas DataFrame.
query_job = client.query(sql)
df = query_job.to_dataframe()
# Cache of per-table metadata so each table is fetched from the API only
# once, even though df.apply() invokes the fetcher once per row.
table_metadata = {}
# Function to fetch table metadata
def get_table_metadata(row):
    """Fetch partitioning/clustering/expiration metadata for one table.

    row: a mapping (e.g. a DataFrame row) with 'project', 'database'
         (the dataset id), and 'tablename' keys.
    Returns a dict with keys 'table_type', 'partitioned_by',
    'clustered_by', and 'expiration_date'; results are memoized in
    the module-level table_metadata cache.
    """
    table = row['tablename']
    if table not in table_metadata:
        project = row['project']
        database = row['database']
        # client.dataset() was deprecated and removed in
        # google-cloud-bigquery 3.x; get_table() accepts a fully
        # qualified "project.dataset.table" id string directly.
        table_obj = client.get_table(f"{project}.{database}.{table}")
        if table_obj.time_partitioning:
            table_type = 'Partitioned'
            # Ingestion-time partitioned tables have no explicit field;
            # they are addressed via the _PARTITIONTIME pseudo-column.
            partitioned_by = table_obj.time_partitioning.field or '_PARTITIONTIME'
        else:
            table_type = 'Not Partitioned'
            partitioned_by = None
        clustered_by = ", ".join(table_obj.clustering_fields) if table_obj.clustering_fields else None
        table_metadata[table] = {
            'table_type': table_type,
            'partitioned_by': partitioned_by,
            'clustered_by': clustered_by,
            'expiration_date': table_obj.expires,
        }
    return table_metadata[table]
# Attach the API-derived metadata to each row, then flatten the dicts
# into their own columns.
df['metadata'] = df.apply(get_table_metadata, axis=1)
df_metadata = df['metadata'].apply(pd.Series)
df = pd.concat([df, df_metadata], axis=1)
df.drop(columns=['metadata'], inplace=True)
# BUG FIX: sort numerically BEFORE formatting. The original formatted
# monthly_cost into strings first, so sort_values ordered the report
# lexicographically (e.g. '$9.00' ranked above '$10.00').
df = df.sort_values(by='monthly_cost', ascending=False)
# Human-readable formatting for the report columns.
df['monthly_cost'] = df['monthly_cost'].map('${:,.2f}'.format)
df['size_gbs'] = df['size_gbs'].map('{:,.2f} GBs'.format)
df['row_count'] = df['row_count'].map('{:,}'.format)
print(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment