from google.cloud import bigquery

client = bigquery.Client()

# client.query() returns a QueryJob object
# https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.job.QueryJob.html#google.cloud.bigquery.job.QueryJob
query_job = client.query(
    """
    SELECT * FROM ...
    """
)

results = query_job.result()  # Make the API request

for row in results:
    # Row values can be accessed by field name or index.
    print("Name: {} Address: {}".format(row["name"], row["address"]))
input("Next demonstration please...") | |

# most common words in Shakespeare
query = """
    SELECT corpus AS title, COUNT(word) AS unique_words
    FROM `bigquery-public-data.samples.shakespeare`
    GROUP BY title
    ORDER BY unique_words DESC
    LIMIT 10
"""

# subtly different: iterating over the QueryJob object itself,
# not the RowIterator returned by .result()
results = client.query(query)

for row in results:
    title = row['title']
    unique_words = row['unique_words']
    print(f'{title:<20} | {unique_words}')
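
# Not in the original demo: the same results can also be pulled into a pandas
# DataFrame, assuming the pandas (and db-dtypes) extras are installed.
# df = results.to_dataframe()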
input("Next demonstration please...") | |

# find the most common commit messages on GitHub
query = """
    SELECT subject AS subject, COUNT(*) AS num_duplicates
    FROM `bigquery-public-data.github_repos.commits`
    GROUP BY subject
    ORDER BY num_duplicates DESC
    LIMIT 10
"""

results = client.query(query)
# Results are served from the query cache by default. To bypass the cache,
# comment out the line above and use these instead
# (see the cache_hit check after the loop):
#
# job_config = bigquery.job.QueryJobConfig(use_query_cache=False)
# results = client.query(query, job_config=job_config)

for row in results:
    subject = row['subject']
    num_duplicates = row['num_duplicates']
    print(f'{subject:<20} | {num_duplicates:>9,}')
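
# Added check: once the job completes, QueryJob.cache_hit reports whether
# the results came from the cache.
print(f'Served from cache: {results.cache_hit}')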
input("Next demonstration please...") | |

# loading JSON data
filename = 'localfile.json'
dataset = bigquery.Dataset('yourproject.yourdataset')
table = dataset.table('yourtable')

# create the LoadJobConfig object and then alter it afterwards
job_config = bigquery.job.LoadJobConfig()
job_config.schema = [
    bigquery.SchemaField('name', 'STRING'),
    bigquery.SchemaField('address', 'STRING'),
]
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON

with open(filename, "rb") as source_file:
    job = client.load_table_from_file(
        source_file,
        table,
        location="europe-north1",  # Must match the destination dataset location
        job_config=job_config,
    )  # API request

job.result()  # Waits for table load to complete.
print('JSON file loaded to BigQuery')
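
# Added check: the completed LoadJob reports how many rows were loaded.
print(f'Loaded {job.output_rows} rows')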

# If still time, look at code samples on
# https://cloud.google.com/bigquery/docs/samples

# creating a view
view_id = "yourproject.yourdataset.yourview"
source_id = "yourproject.yourdataset.yourtable"
view = bigquery.Table(view_id)
view.view_query = """
    SELECT * FROM ...
"""
view = client.create_table(view)
print(f"Created {view.table_type}: {str(view.reference)}")
input("Next demonstration please...") | |

# loading CSV data
filename = 'yourfile.csv'
dataset = bigquery.Dataset('yourproject.yourdataset')
table = dataset.table('yourtable')

# create the LoadJobConfig object and prepopulate it in the constructor
job_config = bigquery.job.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    max_bad_records=1000
)
# WRITE_TRUNCATE: If the table already exists, BigQuery overwrites the table data and uses the schema from the load.
# WRITE_APPEND: If the table already exists, BigQuery appends the data to the table.
# WRITE_EMPTY: If the table already exists and contains data, a 'duplicate' error is returned in the job result.
# https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.write_disposition

with open(filename, "rb") as source_file:
    job = client.load_table_from_file(
        source_file,
        table,
        location="europe-north1",  # Must match the destination dataset location.
        job_config=job_config,
    )  # API request

job.result()  # Waits for table load to complete.
print('CSV file loaded to BigQuery')
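
# Added check: max_bad_records tolerates up to 1000 unparsable rows; any
# errors the load encountered are surfaced on the finished job object.
if job.errors:
    print(f'Load completed with {len(job.errors)} reported errors')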