Created
January 15, 2021 06:55
-
-
Save sanikamal/e589f235673327ff0053fe478a1a5611 to your computer and use it in GitHub Desktop.
Create ML Models with BigQuery ML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Task 1: Create a dataset to store your machine learning models | |
### dataset name is up to you | |
cloudshell: bq mk austin | |
or | |
create the dataset from bigquery console | |
## Task 2: Create a forecasting BigQuery machine learning model. | |
Write the below query in BigQuery console: | |
### model name is up to you | |
CREATE OR REPLACE MODEL austin.aust_model_1 | |
OPTIONS(input_label_cols=['duration_minutes'], model_type='linear_reg') | |
AS SELECT duration_minutes, location, start_station_name, | |
CAST(EXTRACT(dayofweek FROM start_time) AS STRING) as dayofweek, | |
CAST(EXTRACT(hour FROM start_time) AS STRING) AS hourofday, | |
FROM | |
`bigquery-public-data.austin_bikeshare.bikeshare_trips` AS trips | |
JOIN | |
`bigquery-public-data.austin_bikeshare.bikeshare_stations` AS stations | |
ON trips.start_station_id = stations.station_id | |
WHERE | |
EXTRACT(year from start_time) = 2018 | |
Write the below command in cloud shell : | |
export PROJECT_ID=DEVSHELL_PROJECT_ID | |
bq query --use_legacy_sql=false "CREATE OR REPLACE MODEL austin.aust_model_1 OPTIONS(input_label_cols=['duration_minutes'], model_type='linear_reg') | |
AS SELECT duration_minutes, location, start_station_name, CAST(EXTRACT(dayofweek FROM start_time) AS STRING) as dayofweek, CAST(EXTRACT(hour FROM start_time) AS STRING) AS hourofday, | |
FROM \`bigquery-public-data.austin_bikeshare.bikeshare_trips\` AS trips JOIN \`bigquery-public-data.austin_bikeshare.bikeshare_stations\` AS stations ON trips.start_station_id = stations.station_id WHERE EXTRACT(year from start_time) = 2018" | |
## Task 3: Create the second machine learning model. | |
### model name is up to you | |
Write the below query in BigQuery console: | |
CREATE OR REPLACE MODEL austin.aust_model_2 | |
OPTIONS(input_label_cols=['duration_minutes'], model_type='linear_reg') | |
AS SELECT duration_minutes, subscriber_type, start_station_name, | |
CAST(EXTRACT(hour FROM start_time) AS STRING) AS hourofday | |
FROM `bigquery-public-data.austin_bikeshare.bikeshare_trips` AS trips | |
WHERE EXTRACT(year from start_time) = 2018 | |
Write the below command in cloud shell : | |
bq query --use_legacy_sql=false "CREATE OR REPLACE MODEL austin.aust_model_2 OPTIONS(input_label_cols=['duration_minutes'], model_type='linear_reg') | |
AS SELECT duration_minutes, subscriber_type, start_station_name, CAST(EXTRACT(hour FROM start_time) AS STRING) AS hourofday | |
FROM \`bigquery-public-data.austin_bikeshare.bikeshare_trips\` AS trips WHERE EXTRACT(year from start_time) = 2018" | |
## Task 4: Evaluate the two machine learning models. | |
Write the below query in BigQuery console: | |
SELECT | |
SQRT(mean_squared_error) as rmse, mean_absolute_error | |
FROM | |
ML.EVALUATE(MODEL austin.aust_model_1) ; | |
SELECT | |
SQRT(mean_squared_error) as rmse, mean_absolute_error | |
FROM | |
ML.EVALUATE(MODEL austin.aust_model_2); | |
SELECT | |
SQRT(mean_squared_error)AS rmse, mean_absolute_error | |
FROM | |
ML.EVALUATE (MODEL austin.aust_model_1, ( | |
SELECT | |
duration_minutes, location, start_station_name, | |
CAST(EXTRACT(dayofweek FROM start_time) AS STRING) as dayofweek, | |
CAST(EXTRACT(hour FROM start_time) AS STRING) AS hourofday, | |
FROM | |
`bigquery-public-data.austin_bikeshare.bikeshare_trips` AS trips | |
JOIN | |
`bigquery-public-data.austin_bikeshare.bikeshare_stations` AS stations | |
ON trips.start_station_id = stations.station_id | |
WHERE | |
EXTRACT(year from start_time) = 2019 | |
)); | |
SELECT | |
SQRT(mean_squared_error)AS rmse, mean_absolute_error | |
FROM | |
ML.EVALUATE(MODEL austin.aust_model_2, ( | |
SELECT | |
duration_minutes, subscriber_type, start_station_name, | |
CAST(EXTRACT(hour FROM start_time) AS STRING) AS hourofday | |
FROM | |
`bigquery-public-data.austin_bikeshare.bikeshare_trips` AS trips | |
WHERE | |
EXTRACT(year from start_time) = 2019 | |
)) | |
Write the below command in cloud shell : | |
bq query --use_legacy_sql=false "SELECT SQRT(mean_squared_error)AS rmse, mean_absolute_error FROM ML.EVALUATE(MODEL austin.aust_model_1, (SELECT duration_minutes, location, start_station_name, CAST(EXTRACT(dayofweek FROM start_time) AS STRING) as dayofweek, CAST(EXTRACT(hour FROM start_time) AS STRING) AS hourofday, FROM \`bigquery-public-data.austin_bikeshare.bikeshare_trips\` AS trips JOIN \`bigquery-public-data.austin_bikeshare.bikeshare_stations\` AS stations ON trips.start_station_id = stations.station_id WHERE EXTRACT(year from start_time) = 2019))" | |
bq query --use_legacy_sql=false "SELECT SQRT(mean_squared_error)AS rmse, mean_absolute_error FROM ML.EVALUATE(MODEL austin.aust_model_2, (SELECT duration_minutes, subscriber_type, start_station_name, CAST(EXTRACT(hour FROM start_time) AS STRING) AS hourofday FROM \`bigquery-public-data.austin_bikeshare.bikeshare_trips\` AS trips WHERE EXTRACT(year from start_time) = 2019))" | |
## Task 5: Use the subscriber type machine learning model to predict average trip durations | |
Write the below query in BigQuery console: | |
SELECT start_station_name, avg(duration_minutes) as avg_duration, count(*) as total_trips | |
FROM | |
`bigquery-public-data.austin_bikeshare.bikeshare_trips` | |
WHERE | |
EXTRACT(year FROM start_time) = 2019 | |
GROUP BY start_station_name | |
ORDER BY total_trips DESC | |
Write the below command in cloud shell : | |
bq query --use_legacy_sql=false "SELECT start_station_name, avg(duration_minutes) as avg_duration, count(*) as total_trips FROM \`bigquery-public-data.austin_bikeshare.bikeshare_trips\` WHERE EXTRACT(year FROM start_time) = 2019 GROUP BY start_station_name ORDER BY total_trips DESC" | |
in BigQuery Editor write this command : | |
SELECT | |
avg(predicted_duration_minutes),count(duration_minutes) | |
FROM | |
Ml.PREDICT(MODEL austin.aust_model_2, ( | |
SELECT duration_minutes, subscriber_type, start_station_name, | |
CAST(EXTRACT(hour FROM start_time) AS STRING) AS hourofday | |
FROM | |
`bigquery-public-data.austin_bikeshare.bikeshare_trips` AS trips | |
WHERE | |
EXTRACT(year from start_time) = 2019 | |
and start_station_name = '21st & Speedway @PCL' | |
and subscriber_type='Single Trip' | |
)) | |
in Cloud SHell write this command : | |
bq query --use_legacy_sql=false "SELECT avg(predicted_duration_minutes),count(duration_minutes) FROM Ml.PREDICT(MODEL austin.aust_model_2, (SELECT duration_minutes, subscriber_type, start_station_name, CAST(EXTRACT(hour FROM start_time) AS STRING) AS hourofday FROM \`bigquery-public-data.austin_bikeshare.bikeshare_trips\` AS trips WHERE EXTRACT(year from start_time)=2019 and start_station_name = '21st & Speedway @PCL' and subscriber_type='Single Trip' ))" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment