Last active
July 14, 2023 23:37
-
-
Save doit-mattporter/3aff6df56346a8259b9e88babbe3733d to your computer and use it in GitHub Desktop.
Utilizes the Google Cloud BigQuery API to retrieve and display metadata for all tables in a specified BigQuery dataset, including table size, row count, creation time, last-modified time, and partitioning and clustering information, as well as an estimate of the monthly storage cost.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
from google.cloud import bigquery
import pandas as pd
# Uses Application Default Credentials; the billing project comes from the
# environment unless one is passed explicitly to Client().
client = bigquery.Client()
# Specify your dataset
# NOTE(review): placeholders — replace with a real project and dataset id
# before running.
project_id = "your_project_here"
dataset_id = "your_dataset_here"
# Pull per-table stats from the legacy __TABLES__ metadata view, joined with
# INFORMATION_SCHEMA.TABLES for the table type. size_gbs is derived from
# size_bytes, and monthly_cost estimates storage at $0.020 per GB per month.
sql = f"""
SELECT
'{project_id}' as project,
'{dataset_id}' as database,
t.table_id as tablename,
i.table_type,
FORMAT_TIMESTAMP('%Y-%m-%d %H:%M:%S UTC', TIMESTAMP_SECONDS(CAST(t.creation_time/1000 AS INT64))) as creation_time,
FORMAT_TIMESTAMP('%Y-%m-%d %H:%M:%S UTC', TIMESTAMP_SECONDS(CAST(t.last_modified_time/1000 AS INT64))) as last_modified_time,
t.size_bytes,
t.size_bytes/(1024*1024*1024) as size_gbs,
t.size_bytes/(1024*1024*1024)*0.020 as monthly_cost,
t.row_count
FROM
`{project_id}.{dataset_id}.__TABLES__` t
LEFT JOIN
`{project_id}.{dataset_id}.INFORMATION_SCHEMA.TABLES` i
ON
t.table_id = i.table_name
"""
# Run the query and materialize the result as a pandas DataFrame.
query_job = client.query(sql)
df = query_job.to_dataframe()
# Cache of per-table metadata so each table is fetched from the API only
# once, even though df.apply() invokes the fetcher once per row.
table_metadata = {}
# Function to fetch table metadata
def get_table_metadata(row):
    """Fetch partitioning/clustering/expiration metadata for one table.

    row: a mapping (e.g. a DataFrame row) with 'project', 'database'
         (the dataset id), and 'tablename' keys.
    Returns a dict with keys 'table_type', 'partitioned_by',
    'clustered_by', and 'expiration_date'; results are memoized in
    the module-level table_metadata cache.
    """
    table = row['tablename']
    if table not in table_metadata:
        project = row['project']
        database = row['database']
        # client.dataset() was deprecated and removed in
        # google-cloud-bigquery 3.x; get_table() accepts a fully
        # qualified "project.dataset.table" id string directly.
        table_obj = client.get_table(f"{project}.{database}.{table}")
        if table_obj.time_partitioning:
            table_type = 'Partitioned'
            # Ingestion-time partitioned tables have no explicit field;
            # they are addressed via the _PARTITIONTIME pseudo-column.
            partitioned_by = table_obj.time_partitioning.field or '_PARTITIONTIME'
        else:
            table_type = 'Not Partitioned'
            partitioned_by = None
        clustered_by = ", ".join(table_obj.clustering_fields) if table_obj.clustering_fields else None
        table_metadata[table] = {
            'table_type': table_type,
            'partitioned_by': partitioned_by,
            'clustered_by': clustered_by,
            'expiration_date': table_obj.expires,
        }
    return table_metadata[table]
# Attach the API-derived metadata to each row, then flatten the dicts
# into their own columns.
df['metadata'] = df.apply(get_table_metadata, axis=1)
df_metadata = df['metadata'].apply(pd.Series)
df = pd.concat([df, df_metadata], axis=1)
df.drop(columns=['metadata'], inplace=True)
# BUG FIX: sort numerically BEFORE formatting. The original formatted
# monthly_cost into strings first, so sort_values ordered the report
# lexicographically (e.g. '$9.00' ranked above '$10.00').
df = df.sort_values(by='monthly_cost', ascending=False)
# Human-readable formatting for the report columns.
df['monthly_cost'] = df['monthly_cost'].map('${:,.2f}'.format)
df['size_gbs'] = df['size_gbs'].map('{:,.2f} GBs'.format)
df['row_count'] = df['row_count'].map('{:,}'.format)
print(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment