from google.cloud import bigquery
client = bigquery.Client()
# returns a Query Job object
# https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.job.QueryJob.html#google.cloud.bigquery.job.QueryJob
query = client.query(
    """
    SELECT * FROM ...
    """
)
results = query.result() # Make the API request
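# Not in the original gist: once the job has run, the QueryJob object also
# exposes metadata such as the job ID and bytes processed. A minimal sketch:
# print(query.job_id, query.total_bytes_processed)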
for row in results:
    # Row values can be accessed by field name or index.
    print(f"Name: {row['name']} Address: {row['address']}")
input("Next demonstration please...")
# works of Shakespeare with the most unique words
query = """
SELECT corpus AS title, COUNT(word) AS unique_words
FROM `bigquery-public-data.samples.shakespeare`
GROUP BY title
ORDER BY unique_words
DESC LIMIT 10
"""
# subtly different: iterating the QueryJob object itself (which calls result() implicitly), not a results object
results = client.query(query)
for row in results:
    title = row['title']
    unique_words = row['unique_words']
    print(f'{title:<20} | {unique_words}')
input("Next demonstration please...")
# find the most common commit messages on GitHub
query = """
SELECT subject AS subject, COUNT(*) AS num_duplicates
FROM bigquery-public-data.github_repos.commits
GROUP BY subject
ORDER BY num_duplicates
DESC LIMIT 10
"""
results = client.query(query)
# Query results are cached by default. To bypass the cache for this
# demonstration, comment out the line above and use these instead:
#
# job_config = bigquery.job.QueryJobConfig(use_query_cache=False)
# results = client.query(query, job_config=job_config)
for row in results:
    subject = row['subject']
    num_duplicates = row['num_duplicates']
    print(f'{subject:<20} | {num_duplicates:>9,}')
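# Not in the original gist: after the job completes, QueryJob.cache_hit
# reports whether the results were served from the cache. A minimal sketch:
# print(f'Served from cache: {results.cache_hit}')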
input("Next demonstration please...")
# loading JSON data
filename = 'localfile.json'
dataset = bigquery.Dataset('yourproject.yourdataset')
table = dataset.table('yourtable')
# create an empty LoadJobConfig object and set its properties afterwards
job_config = bigquery.job.LoadJobConfig()
job_config.schema = [
    bigquery.SchemaField('name', 'STRING'),
    bigquery.SchemaField('address', 'STRING'),
]
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
with open(filename, "rb") as source_file:
    job = client.load_table_from_file(
        source_file,
        table,
        location="europe-north1",  # Must match the destination dataset location
        job_config=job_config,
    )  # API request
job.result() # Waits for table load to complete.
print('JSON file loaded to BigQuery')
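# Sketch, not in the original gist: one way to confirm the load is to fetch
# the destination table and print its row count:
# destination = client.get_table(table)
# print(f'Table now has {destination.num_rows} rows')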
# If still time, look at code samples on
# https://cloud.google.com/bigquery/docs/samples
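# creating a view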
view_id = "yourproject.yourdataset.yourview"
source_id = "yourproject.yourdataset.yourtable"
view = bigquery.Table(view_id)
view.view_query = """
SELECT * FROM ...
"""
view = client.create_table(view)
print(f"Created {view.table_type}: {str(view.reference)}")
input("Next demonstration please...")
# loading CSV data
filename = 'yourfile.csv'
dataset = bigquery.Dataset('yourproject.yourdataset')
table = dataset.table('yourtable')
# create the LoadJobConfig object and prepopulate it in the constructor
job_config = bigquery.job.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    max_bad_records=1000,
)
# WRITE_TRUNCATE: If the table already exists, BigQuery overwrites the table data and uses the schema from the load.
# WRITE_APPEND: If the table already exists, BigQuery appends the data to the table.
# WRITE_EMPTY: If the table already exists and contains data, a 'duplicate' error is returned in the job result.
# https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.write_disposition
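# Sketch, not in the original gist: to append rather than overwrite, swap the
# disposition on the config above:
# job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND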
with open(filename, "rb") as source_file:
    job = client.load_table_from_file(
        source_file,
        table,
        location="europe-north1",  # Must match the destination dataset location.
        job_config=job_config,
    )  # API request
job.result() # Waits for table load to complete.
print('CSV file loaded to BigQuery')
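# Not in the original gist: LoadJob exposes output_rows once the job is done.
# A minimal sketch:
# print(f'{job.output_rows} rows loaded')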