Created
April 17, 2019 10:08
-
-
Save frankie-yanfeng/fb4ce8ad0408d755422e30da02acc45d to your computer and use it in GitHub Desktop.
create the env variables to call gsutil command
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# change these to try this notebook out | |
BUCKET = 'cloud-training-demos-ml' | |
PROJECT = 'cloud-training-demos' | |
REGION = 'us-central1' | |
import os | |
os.environ['BUCKET'] = BUCKET | |
os.environ['PROJECT'] = PROJECT | |
os.environ['REGION'] = REGION | |
%%bash | |
if ! gsutil ls | grep -q gs://${BUCKET}/; then | |
gsutil mb -l ${REGION} gs://${BUCKET} | |
fi | |
# Create SQL query using natality data after the year 2000 | |
query = """ | |
SELECT | |
weight_pounds, | |
is_male, | |
mother_age, | |
plurality, | |
gestation_weeks, | |
ABS(FARM_FINGERPRINT(CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING)))) AS hashmonth | |
FROM | |
publicdata.samples.natality | |
WHERE year > 2000 | |
""" | |
# Call BigQuery and examine in dataframe | |
import google.datalab.bigquery as bq | |
df = bq.Query(query + " LIMIT 100").execute().result().to_dataframe() | |
df.head() | |
# Create function that finds the number of records and the average weight for each value of the chosen column | |
def get_distinct_values(column_name): | |
sql = """ | |
SELECT | |
{0}, | |
COUNT(1) AS num_babies, | |
AVG(weight_pounds) AS avg_wt | |
FROM | |
publicdata.samples.natality | |
WHERE | |
year > 2000 | |
GROUP BY | |
{0} | |
""".format(column_name) | |
return bq.Query(sql).execute().result().to_dataframe() | |
# Bar plot to see is_male with avg_wt linear and num_babies logarithmic | |
df = get_distinct_values('is_male') | |
df.plot(x='is_male', y='num_babies', kind='bar'); | |
df.plot(x='is_male', y='avg_wt', kind='bar'); | |
# Line plots to see mother_age with avg_wt linear and num_babies logarithmic | |
df = get_distinct_values('mother_age') | |
df = df.sort_values('mother_age') | |
df.plot(x='mother_age', y='num_babies'); | |
df.plot(x='mother_age', y='avg_wt'); | |
# Bar plot to see plurality(singleton, twins, etc.) with avg_wt linear and num_babies logarithmic | |
df = get_distinct_values('plurality') | |
df = df.sort_values('plurality') | |
df.plot(x='plurality', y='num_babies', logy=True, kind='bar'); | |
df.plot(x='plurality', y='avg_wt', kind='bar'); | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment