Skip to content

Instantly share code, notes, and snippets.

@Jong-Sig
Created June 16, 2024 20:32
Show Gist options
  • Select an option

  • Save Jong-Sig/907c70d24ee439f7a16cc03c9cd2e146 to your computer and use it in GitHub Desktop.

Select an option

Save Jong-Sig/907c70d24ee439f7a16cc03c9cd2e146 to your computer and use it in GitHub Desktop.
Export Variables to Google Cloud Storage
# @title 1-2. Export GitHub-Full Samples to GCS
# Setup: BigQuery/GCS API clients plus the bucket, dataset, and table-name
# constants used by the export loop below.
from google.cloud import bigquery, storage

# Clients authenticate via application-default credentials.
client = bigquery.Client()
storage_client = storage.Client()

# Destination bucket/folder on GCS and the source dataset on BigQuery.
bucket_name = 'cloud-github'
bucket_folder = 'github-full'
project = 'github-416320'
dataset_id = 'github_full'

# Source tables are named <prefix><suffix>, one table per GitHub event type.
table_id_prefix = 'github_sample_data_'
table_id_suffix = [
    'PR_event',
    'PR_review_comment_event',
    'PR_review_event',
    'commit_comment_event',
    'commit_event',
    'create_event',
    'delete_event',
    'fork_event',
    'gollum_event',
    'issue_comment_event',
    'issue_event',
    'member_event',
    'public_event',
    'push_event',
    'release_event',
    'watch_event',
]
## Run results here ##
# Export every event table from BigQuery to GCS as gzip-compressed Parquet.
# Fix vs. original: the dataset reference and the extract-job configuration
# are loop-invariant, so build them once instead of once per table.
dataset_ref = client.dataset(dataset_id, project=project)

configuration = bigquery.job.ExtractJobConfig()
configuration.compression = bigquery.Compression.GZIP
configuration.destination_format = bigquery.DestinationFormat.PARQUET

for suffix in table_id_suffix:
    table_id = table_id_prefix + suffix
    # Sharded destination: BigQuery expands '*' into file numbers when the
    # table is too large for a single export file.
    destination_uri = f'gs://{bucket_name}/{bucket_folder}/{suffix}/{table_id}_*.parquet.gzip'
    table_ref = dataset_ref.table(table_id)
    # Submit the extract job; 'US' is the location of the source dataset.
    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        job_config=configuration,
        location='US')
    # Block until this export finishes; raises if the job failed.
    extract_job.result()
    print(f'{suffix} extract completed')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment