############################################################
# Author  Krisjan Oldekamp / Stacktonic.com
# Email   krisjan@stacktonic.com
# Article https://stacktonic.com/article/backup-your-valuable-big-query-views-and-scheduled-queries-using-python
############################################################

import os
import git
import google.oauth2.service_account
from google.cloud import bigquery
from google.cloud import bigquery_datatransfer # Also enable the Data Transfer API in the GCP console
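# Dependencies (assumed from the imports above, not pinned in the original gist):
#   pip install GitPython google-cloud-bigquery google-cloud-bigquery-datatransfer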
############################################################
# Settings
LOCAL_PATH_BACKUP = "./gcp-bigquery-sql-backup/" # Backup directory (end with a trailing /). When REPO_COMMIT is set to True, this folder is also used to clone and commit to the Git repository
REPO_COMMIT = True # When True, the script clones the Git repository specified in REPO_LINK, deletes all folders in the repo, downloads the latest version of the views and scheduled queries into the repo, and commits the changes
REPO_LINK = "git@bitbucket.org..." # Repository link
REPO_KEY_PATH = "../.ssh/..." # Path to the SSH private key used to authenticate with the repository. Leave empty when not using an SSH key
GCP_JSON_KEYPATH = "your-keyfile.json" # Path to the JSON keyfile of the Service Account

# You can specify multiple projects. The location / region is required for downloading scheduled query SQL
GCP_PROJECTS_BACKUP = [
    {
        "project_id": "your-gcp-project-id",
        "location": "europe" # See the "region" column in the GCP scheduled query interface
    }
]
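# Resulting backup structure (derived from the save functions below):
#   <LOCAL_PATH_BACKUP>/<project_id>/bq_views/<dataset>/<view>.sql
#   <LOCAL_PATH_BACKUP>/<project_id>/bq_scheduled_queries/<scheduled_query>.sql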
# End of settings
############################################################

# Authenticate using a Service Account / JSON keyfile
def get_credentials(keyfile_json):
    scopes = ["https://www.googleapis.com/auth/bigquery"]
    return google.oauth2.service_account.Credentials.from_service_account_file(keyfile_json, scopes=scopes)

# Format a scheduled query name so it can be used in a filename
def format_name(name):
    return name.lower().replace(" ", "_")
# Push commit to repository
def git_push_commit(repo, repo_key_path, repo_local_path):
    try:
        if repo_key_path == "":
            repo.git.add(all=True)
            repo.index.commit("Automated commit containing changes to BigQuery views and scheduled queries.")
            repo.remotes.origin.push()
        else:
            # Use the SSH key for authentication on the repository's own Git command wrapper
            with repo.git.custom_environment(GIT_SSH_COMMAND="ssh -i " + repo_key_path):
                repo.git.add(all=True)
                repo.index.commit("Automated commit containing changes to BigQuery views and scheduled queries.")
                repo.remotes.origin.push()
    except Exception as e:
        print("An error occurred while pushing the commit: {}".format(e))
# Save Google BigQuery views to the local filesystem
def save_bigquery_views(credentials, project_id, path):
    client_bq = bigquery.Client(credentials=credentials, project=project_id)
    datasets = list(client_bq.list_datasets())
    cnt_views = 0

    # Loop over datasets
    if datasets:
        for dataset in datasets:
            dataset_name = dataset.dataset_id
            dataset_ref = dataset.reference
            tables = list(client_bq.list_tables(dataset_ref))

            # Loop over tables in the dataset
            for table in tables:
                # Only select views
                if table.table_type == "VIEW":
                    table_name = table.table_id
                    table_ref = dataset_ref.table(table_name)
                    table = client_bq.get_table(table_ref)

                    backup_directory = path + project_id + "/bq_views/" + dataset_name
                    if not os.path.exists(backup_directory):
                        os.makedirs(backup_directory) # Create the directory when it doesn't exist

                    # Save view SQL to file
                    with open(backup_directory + "/" + table_name + ".sql", "w+") as f:
                        f.write(table.view_query)

                    cnt_views += 1

    return cnt_views
# Save Google BigQuery scheduled queries to the local filesystem
def save_bigquery_scheduled_queries(credentials, project_id, location, path):
    client_bq_df = bigquery_datatransfer.DataTransferServiceClient(credentials=credentials)
    parent = "projects/{}/locations/{}".format(project_id, location)
    cnt_scheduled_queries = 0

    # Loop over all Data Transfer configs in the project (pass the parent as a keyword argument, as required by newer client versions)
    for element in client_bq_df.list_transfer_configs(parent=parent):
        scheduled_query_name = format_name(element.display_name)
        scheduled_query_sql = ""

        # Loop over Data Transfer parameters, select scheduled queries and get the query SQL
        params = element.params.items()
        for key, value in params:
            if key == "query":
                scheduled_query_sql = value

        backup_directory = path + project_id + "/bq_scheduled_queries/"
        if not os.path.exists(backup_directory): # Create the directory when it doesn't exist
            os.makedirs(backup_directory)

        # Write to file
        with open(backup_directory + "/" + scheduled_query_name + ".sql", "w+") as f:
            f.write(scheduled_query_sql)

        cnt_scheduled_queries += 1

    return cnt_scheduled_queries
def execute():
    credentials = get_credentials(GCP_JSON_KEYPATH)

    print("Found {} GCP projects in settings".format(len(GCP_PROJECTS_BACKUP)))

    if REPO_COMMIT:
        # Clone the repository if Git commits are enabled
        print("Git commits enabled. Cloning repository {} to {}".format(REPO_LINK, LOCAL_PATH_BACKUP))

        # Delete the folder / repository when it already exists
        if os.path.exists(LOCAL_PATH_BACKUP):
            git.rmtree(LOCAL_PATH_BACKUP)

        if REPO_KEY_PATH == "":
            repo = git.Repo.clone_from(REPO_LINK, LOCAL_PATH_BACKUP)
        else:
            repo = git.Repo.clone_from(REPO_LINK, LOCAL_PATH_BACKUP, env={"GIT_SSH_COMMAND": "ssh -i " + REPO_KEY_PATH}) # Use SSH key

        # Remove old backups in the repository
        repo_dirs = next(os.walk(LOCAL_PATH_BACKUP))[1]
        for repo_dir in repo_dirs:
            if not repo_dir.startswith("."):
                git.rmtree(os.path.join(LOCAL_PATH_BACKUP, repo_dir))
    else:
        # Only download to the local filesystem. Create the directory when it doesn't exist
        if not os.path.exists(LOCAL_PATH_BACKUP):
            os.makedirs(LOCAL_PATH_BACKUP)

    # Loop over the GCP projects and save views and scheduled queries
    for i, project in enumerate(GCP_PROJECTS_BACKUP):
        print("-- Starting backup for project: {}".format(project["project_id"]))

        views = save_bigquery_views(
            credentials,
            project["project_id"],
            LOCAL_PATH_BACKUP
        )
        print("# {} views saved..".format(views))

        scheduled_queries = save_bigquery_scheduled_queries(
            credentials,
            project["project_id"],
            project["location"],
            LOCAL_PATH_BACKUP
        )
        print("# {} scheduled queries saved..".format(scheduled_queries))

    # Push code to the remote repository
    if REPO_COMMIT:
        git_push_commit(repo, REPO_KEY_PATH, LOCAL_PATH_BACKUP)
        print("Pushed code to repository..")

    print("Done.")

execute()
@richardbiddle You're completely right, updated the snippet.
Thanks @krisjan-oldekamp works perfectly
Hi. I get this error -- TypeError: Invalid constructor input for ListTransferConfigsRequest: 'projects/myproject/locations/us-central1'.
I know that I am entering the correct inputs for project/location. I can transfer Views, but get the error for scheduled queries.
@normwarren I came across the same error message. I found the solution on Stack Overflow: https://stackoverflow.com/questions/67671525/bigquery-datatransfer-invalid-constructor-input-for-startmanualtransferrunsreq.
It seems that client_bq_df.list_transfer_configs(parent) expects the project and location information as a dictionary instead of a string.
Following up on @miguel-graindata's comment: replace line 99 with:
parent = dict(parent= "projects/{}/locations/{}".format(project_id, location))
...and it works
(edited to write a better fix than I had)
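For reference, a minimal sketch of the two call forms that work with the newer google-cloud-bigquery-datatransfer client (only the dict-style fix comes from the comments above; the keyword-argument form is an assumption based on the same generated client API):

from google.cloud import bigquery_datatransfer

client_bq_df = bigquery_datatransfer.DataTransferServiceClient()  # uses default credentials in this sketch
parent = "projects/{}/locations/{}".format("your-gcp-project-id", "europe")

# Pass the parent as a keyword argument...
configs = client_bq_df.list_transfer_configs(parent=parent)
# ...or wrap it in a request dict, as suggested above
configs = client_bq_df.list_transfer_configs(request={"parent": parent})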
Have gone a step further - my forked gist for this backs up stored procedures as well, if that's useful to anyone!
Looks to be missing f.write(table.view_query) in save_bigquery_views https://gist.github.com/krisjan-oldekamp/78869851421af2f75325c32302fa2137#file-google_bigquery_backup_views_scheduled_queries_git-py-L90
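For anyone who wants the stored procedure backup without digging up that fork, a minimal sketch of how it could slot in next to save_bigquery_views (the function name and the bq_routines directory are just assumptions, not the commenter's actual code; list_routines, get_routine and Routine.body come from the google-cloud-bigquery library, and the same imports and credentials as the gist are assumed):

def save_bigquery_routines(credentials, project_id, path):
    client_bq = bigquery.Client(credentials=credentials, project=project_id)
    cnt_routines = 0
    for dataset in client_bq.list_datasets():
        for routine_item in client_bq.list_routines(dataset.reference):
            routine = client_bq.get_routine(routine_item.reference)  # fetch the full routine, including its body
            if routine.type_ != "PROCEDURE":  # stored procedures only
                continue
            backup_directory = path + project_id + "/bq_routines/" + dataset.dataset_id
            if not os.path.exists(backup_directory):
                os.makedirs(backup_directory)
            with open(backup_directory + "/" + routine.routine_id + ".sql", "w+") as f:
                f.write(routine.body or "")
            cnt_routines += 1
    return cnt_routines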