frankie-yanfeng/gcp_bucket_creation.py

## gcp_bucket_creation.py
# change these to try this notebook out
BUCKET = 'cloud-training-demos-ml'
PROJECT = 'cloud-training-demos'
REGION = 'us-central1'

import os
os.environ['BUCKET'] = BUCKET
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION

%%bash
if ! gsutil ls | grep -q gs://${BUCKET}/; then
  gsutil mb -l ${REGION} gs://${BUCKET}
fi

# Create SQL query using natality data after the year 2000
query = """
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks,
  ABS(FARM_FINGERPRINT(CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING)))) AS hashmonth
FROM
  publicdata.samples.natality
WHERE year > 2000
"""

# Call BigQuery and examine in dataframe
import google.datalab.bigquery as bq
df = bq.Query(query + " LIMIT 100").execute().result().to_dataframe()
df.head()

# Create function that finds the number of records and the average weight for each value of the chosen column
def get_distinct_values(column_name):
  sql = """
SELECT
  {0},
  COUNT(1) AS num_babies,
  AVG(weight_pounds) AS avg_wt
FROM
  publicdata.samples.natality
WHERE
  year > 2000
GROUP BY
  {0}
  """.format(column_name)
  return bq.Query(sql).execute().result().to_dataframe()

# Bar plot to see is_male with avg_wt linear and num_babies logarithmic
df = get_distinct_values('is_male')
df.plot(x='is_male', y='num_babies', kind='bar');
df.plot(x='is_male', y='avg_wt', kind='bar');

# Line plots to see mother_age with avg_wt linear and num_babies logarithmic
df = get_distinct_values('mother_age')
df = df.sort_values('mother_age')
df.plot(x='mother_age', y='num_babies');
df.plot(x='mother_age', y='avg_wt');

# Bar plot to see plurality(singleton, twins, etc.) with avg_wt linear and num_babies logarithmic
df = get_distinct_values('plurality')
df = df.sort_values('plurality')
df.plot(x='plurality', y='num_babies', logy=True, kind='bar');
df.plot(x='plurality', y='avg_wt', kind='bar');
	# change these to try this notebook out
	BUCKET = 'cloud-training-demos-ml'
	PROJECT = 'cloud-training-demos'
	REGION = 'us-central1'

	import os
	os.environ['BUCKET'] = BUCKET
	os.environ['PROJECT'] = PROJECT
	os.environ['REGION'] = REGION

	%%bash
	if ! gsutil ls \| grep -q gs://${BUCKET}/; then
	gsutil mb -l ${REGION} gs://${BUCKET}
	fi

	# Create SQL query using natality data after the year 2000
	query = """
	SELECT
	weight_pounds,
	is_male,
	mother_age,
	plurality,
	gestation_weeks,
	ABS(FARM_FINGERPRINT(CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING)))) AS hashmonth
	FROM
	publicdata.samples.natality
	WHERE year > 2000
	"""

	# Call BigQuery and examine in dataframe
	import google.datalab.bigquery as bq
	df = bq.Query(query + " LIMIT 100").execute().result().to_dataframe()
	df.head()

	# Create function that finds the number of records and the average weight for each value of the chosen column
	def get_distinct_values(column_name):
	sql = """
	SELECT
	{0},
	COUNT(1) AS num_babies,
	AVG(weight_pounds) AS avg_wt
	FROM
	publicdata.samples.natality
	WHERE
	year > 2000
	GROUP BY
	{0}
	""".format(column_name)
	return bq.Query(sql).execute().result().to_dataframe()

	# Bar plot to see is_male with avg_wt linear and num_babies logarithmic
	df = get_distinct_values('is_male')
	df.plot(x='is_male', y='num_babies', kind='bar');
	df.plot(x='is_male', y='avg_wt', kind='bar');

	# Line plots to see mother_age with avg_wt linear and num_babies logarithmic
	df = get_distinct_values('mother_age')
	df = df.sort_values('mother_age')
	df.plot(x='mother_age', y='num_babies');
	df.plot(x='mother_age', y='avg_wt');

	# Bar plot to see plurality(singleton, twins, etc.) with avg_wt linear and num_babies logarithmic
	df = get_distinct_values('plurality')
	df = df.sort_values('plurality')
	df.plot(x='plurality', y='num_babies', logy=True, kind='bar');
	df.plot(x='plurality', y='avg_wt', kind='bar');