from google.cloud import bigquery
client = bigquery.Client()
# returns a Query Job object
# https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.job.QueryJob.html#google.cloud.bigquery.job.QueryJob
query = client.query(
    """
    SELECT * FROM ...
    """
)
results = query.result() # Make the API request
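# Not in the original gist: once the job has run, the QueryJob object also
# exposes metadata such as the job ID and bytes processed. A minimal sketch:
# print(query.job_id, query.total_bytes_processed)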
for row in results:
    # Row values can be accessed by field name or index.
    print(f"Name: {row['name']} Address: {row['address']}")
input("Next demonstration please...")
# works of Shakespeare with the most unique words
query = """
SELECT corpus AS title, COUNT(word) AS unique_words
FROM `bigquery-public-data.samples.shakespeare`
GROUP BY title
ORDER BY unique_words
DESC LIMIT 10
"""
# subtly different: iterating the QueryJob object itself (which calls result() implicitly), not a results object
results = client.query(query)
for row in results:
    title = row['title']
    unique_words = row['unique_words']
    print(f'{title:<20} | {unique_words}')
input("Next demonstration please...")
# find the most common commit messages on GitHub
query = """
SELECT subject AS subject, COUNT(*) AS num_duplicates
FROM bigquery-public-data.github_repos.commits
GROUP BY subject
ORDER BY num_duplicates
DESC LIMIT 10
"""
results = client.query(query)
# Query results are cached by default. To bypass the cache for this
# demonstration, comment out the line above and use these instead:
#
# job_config = bigquery.job.QueryJobConfig(use_query_cache=False)
# results = client.query(query, job_config=job_config)
for row in results:
    subject = row['subject']
    num_duplicates = row['num_duplicates']
    print(f'{subject:<20} | {num_duplicates:>9,}')
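# Not in the original gist: after the job completes, QueryJob.cache_hit
# reports whether the results were served from the cache. A minimal sketch:
# print(f'Served from cache: {results.cache_hit}')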
input("Next demonstration please...")
# loading JSON data
filename = 'localfile.json'
dataset = bigquery.Dataset('yourproject.yourdataset')
table = dataset.table('yourtable')
# create an empty LoadJobConfig object and set its properties afterwards
job_config = bigquery.job.LoadJobConfig()
job_config.schema = [
    bigquery.SchemaField('name', 'STRING'),
    bigquery.SchemaField('address', 'STRING'),
]
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
with open(filename, "rb") as source_file:
    job = client.load_table_from_file(
        source_file,
        table,
        location="europe-north1",  # Must match the destination dataset location
        job_config=job_config,
    )  # API request
job.result() # Waits for table load to complete.
print('JSON file loaded to BigQuery')
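# Sketch, not in the original gist: one way to confirm the load is to fetch
# the destination table and print its row count:
# destination = client.get_table(table)
# print(f'Table now has {destination.num_rows} rows')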
# If still time, look at code samples on
# https://cloud.google.com/bigquery/docs/samples
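# creating a view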
view_id = "yourproject.yourdataset.yourview"
source_id = "yourproject.yourdataset.yourtable"
view = bigquery.Table(view_id)
view.view_query = """
SELECT * FROM ...
"""
view = client.create_table(view)
print(f"Created {view.table_type}: {str(view.reference)}")
input("Next demonstration please...")
# loading CSV data
filename = 'yourfile.csv'
dataset = bigquery.Dataset('yourproject.yourdataset')
table = dataset.table('yourtable')
# create the LoadJobConfig object and prepopulate it in the constructor
job_config = bigquery.job.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    max_bad_records=1000,
)
# WRITE_TRUNCATE: If the table already exists, BigQuery overwrites the table data and uses the schema from the load.
# WRITE_APPEND: If the table already exists, BigQuery appends the data to the table.
# WRITE_EMPTY: If the table already exists and contains data, a 'duplicate' error is returned in the job result.
# https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.write_disposition
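# Sketch, not in the original gist: to append rather than overwrite, swap the
# disposition on the config above:
# job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND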
with open(filename, "rb") as source_file:
    job = client.load_table_from_file(
        source_file,
        table,
        location="europe-north1",  # Must match the destination dataset location.
        job_config=job_config,
    )  # API request
job.result() # Waits for table load to complete.
print('CSV file loaded to BigQuery')
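# Not in the original gist: LoadJob exposes output_rows once the job is done.
# A minimal sketch:
# print(f'{job.output_rows} rows loaded')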