A Google Cloud Function that takes a list of projects, gets all BigQuery datasets in each, and updates each dataset to the "PHYSICAL" storage billing model.
"""
This (Google Cloud) Function will scan all a list of provided projects and get all Bigquery datasets for each of those projects.
It will then update the storage billing model for each dataset to be 'PHYSICAL' instead of 'LOGICAL'. This is useful for large datasets that are frequently accessed, as it can reduce costs.
Required Permissions:
- The service account running this function must have the following roles:
to be updated---
Returns:
string: A message indicating the function has completed.
"""
from google.cloud import storage
from google.cloud import bigquery
from google.cloud import resourcemanager_v3 as resourcemanager
import functions_framework
import csv
"""
This function can work in 1 of 2 ways:
- It can be triggered by an HTTP request, which is useful for testing and manual execution. In that case, the function will need to be deployed with an HTTP trigger and will require the following args, on each deployment:
- project_list_bucket: The name of the bucket where the CSV file containing the list of projects is stored.
- project_list_blob: The name of the CSV file containing the list of projects. This file should contain a list of projects, with each project on a new line. The CSV file should have a header row with the column name 'project_id'.
>>>THIS IS THE WAY THIS GIST IS CURRENTLY CONFIGURED<<<<
Additionally, you can use the get_projects_from_resource_manager() function to list all projects in the organization. This is useful for testing and manual execution, but it is not recommended for production use, as it will require the function to have the 'resourcemanager.projects.list' permission, which is a very broad permission.
This function is not currently used in the main logic flow and merely exists outside and prints the list of projects in the organization that it has access to with it's current permissions.
"""
@functions_framework.http
def bigquery_dataset_review_update(request):
"""
Updates the billing model of datasets in BigQuery based on a CSV file.
Args:
request (flask.Request): The HTTP request object.
Returns:
str: The result message indicating the completion status of the update.
"""
request_json = request.get_json(silent=True)
request_args = request.args
print(f"Request JSON: {request_json}")
print(f"Request args: {request_args}")
project_list_bucket = request.args.get('project_list_bucket')
project_list_blob = request.args.get('project_list_blob')
if not project_list_bucket or not project_list_blob:
return 'Missing required parameters: project_list_bucket and project_list_blob', 400
# Initialize clients once
storage_client = storage.Client()
bigquery_client = bigquery.Client()
# Step one: Download CSV of projects from bucket
try:
destination_file_name = './projects.csv'
download_csv_from_bucket(storage_client, project_list_bucket, project_list_blob, destination_file_name)
except Exception as e:
return f'Failed to download CSV: {e}', 500
# Step two: Read CSV and get list of projects
try:
projects = read_csv_to_list(destination_file_name)
except Exception as e:
return f'Failed to read projects CSV: {e}', 500
# Step three: Iterate over projects and list datasets
for project in projects:
try:
project_id = project['project_id'] # Adjust based on your CSV columns
datasets = list_datasets_in_project(bigquery_client, project_id)
for dataset in datasets:
dataset_id = dataset.dataset_id
full_dataset_id = f"{project_id}.{dataset_id}"
alter_dataset_storage_billing_model(bigquery_client, full_dataset_id, 'PHYSICAL')
except Exception as e:
# Log the error but continue processing other projects
print(f"Error processing project {project_id}: {e}")
return 'Completed updating datasets billing model.', 200
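# For local testing, one option (an assumption, not part of this gist) is the
# functions-framework CLI, which serves the function over HTTP on port 8080:
#   functions-framework --target bigquery_dataset_review_update --debug
#   curl "http://localhost:8080/?project_list_bucket=my-config-bucket&project_list_blob=projects.csv"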
def download_csv_from_bucket(storage_client, bucket_name, source_blob_name, destination_file_name):
    """Downloads a blob from a GCS bucket to a local file."""
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)
    print(f"Blob {source_blob_name} downloaded to {destination_file_name}.")
def read_csv_to_list(file_path):
    """Reads a CSV file and returns its rows as a list of dicts keyed by the header row."""
    with open(file_path, mode='r', encoding='utf-8') as file:
        csv_reader = csv.DictReader(file)
        return [row for row in csv_reader]
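# For the sample projects.csv sketched above, read_csv_to_list would return:
#   [{'project_id': 'my-project-one'}, {'project_id': 'my-project-two'}]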
def list_datasets_in_project(bigquery_client, project_id):
    """Returns all datasets in the given project as a list of DatasetListItem objects."""
    datasets = list(bigquery_client.list_datasets(project=project_id))
    print(datasets)
    return datasets
def alter_dataset_storage_billing_model(bigquery_client, dataset_name, billing_model):
    """Runs an ALTER SCHEMA DDL statement that sets the dataset's storage billing model."""
    query = f"""
        ALTER SCHEMA `{dataset_name}`
        SET OPTIONS(
            storage_billing_model = '{billing_model}'
        );
    """
    bigquery_client.query(query).result()
    print(f"Dataset '{dataset_name}' updated to use the '{billing_model}' billing model.")
# This function is not currently used in the main logic flow; it merely sits outside
# the flow and prints the list of projects it has access to with its current permissions.
def get_projects_from_resource_manager():
    client = resourcemanager.ProjectsClient()
    # search_projects() returns the projects the caller has permission to view;
    # list_projects() requires a parent (organization or folder) argument.
    for project in client.search_projects():
        print(project)
# Call manually if needed (calling it at import time would run on every cold start):
#   get_projects_from_resource_manager()