Gad Benram gidutz

## predict_bill.sql
SELECT
  year,
  month,
  day,
  cumsum,
  ROUND(predicted_label) predicted,
  monthly_cost actual_month_cost,
  ROUND(100 * (ABS(predicted_label - monthly_cost) /  monthly_cost),1) abs_err
FROM
  ML.PREDICT(MODEL `billing_dataset_example.model_linear_regression`,

## fit_model.sql
CREATE OR REPLACE MODEL
  billing_dataset_example.model_linear_regression --model save path
OPTIONS
  ( model_type='linear_reg', -- As of Aug 2018 you can choose between linear regression and logistic regression
    ls_init_learn_rate=.015,
    l1_reg=0.1,
    l2_reg=0.1,
    data_split_method='seq',
    data_split_col='split_col',


## create_dataset.sql
WITH
  cumsum_table AS(
  SELECT
    account_name, year, month, day,
    ROUND(SUM(daily_cost) OVER (PARTITION BY account_name, year, month ORDER BY day )) AS cumsum,
    ROUND(AVG(daily_cost) OVER (PARTITION BY account_name, year, month ORDER BY day )) AS mean_daily_cost
  FROM
    `billing_dataset_example.billing_daily_monthly`
  ORDER BY  account_name, year, month, day),
  monthly_cost_table AS (

## aggregate_to_daily.sql
-- https://stackoverflow.com/questions/41707583/get-total-no-of-days-in-given-month-in-google-bigquery
CREATE TEMP FUNCTION DaysInMonth(d TIMESTAMP) AS (
  32 - EXTRACT(DAY FROM DATE_ADD(DATE_TRUNC(CAST (d AS DATE), MONTH), INTERVAL 31 DAY))
);

WITH monthly_cost_table AS (SELECT
                      account_name,
                      EXTRACT (MONTH FROM bill_datetime) month,
                      EXTRACT (YEAR FROM bill_datetime) year,
                      SUM(cost) monthly_cost

## extract_features.sql
WITH dataset AS( SELECT

              EXTRACT(HOUR FROM  trip_start_timestamp) trip_start_hour
            , EXTRACT(DAYOFWEEK FROM  trip_start_timestamp) trip_start_weekday
            , EXTRACT(WEEK FROM  trip_start_timestamp) trip_start_week
            , EXTRACT(DAYOFYEAR FROM  trip_start_timestamp) trip_start_yearday
            , EXTRACT(MONTH FROM  trip_start_timestamp) trip_start_month
            , (trip_miles * 1.60934 ) / ((trip_seconds + .01) / (60 * 60)) trip_speed_kmph
            , trip_miles
            , pickup_latitude

## hyper_param_spec.json
{
        "scaleTier": "CUSTOM",
        "masterType": "standard_gpu",
        "args": [
            "--preprocess",
            "--validation_split=0.2",
            "--model_type=regression",
            "--hidden_units=120,60,60",
            "--batch_size=128",
            "--eval_frequency_secs=128",

## submit_train.py

def train_hyper_params(cloudml_client, training_inputs):

    job_name = 'chicago_travel_time_training_{}'.format(datetime.utcnow().strftime('%Y%m%d%H%M%S'))
    project_name = 'projects/{}'.format(project_id)
    job_spec = {'jobId': job_name, 'trainingInput': training_inputs}
    response = cloudml_client.projects().jobs().create(body=job_spec,
                                                parent=project_name).execute()
    print(response)

## deploy_model.py
def create_model(cloudml_client):
    """
    Creates a Model entity in AI Platform
    :param cloudml_client: discovery client
    """
    models = cloudml_client.projects().models()
    create_spec = {'name': model_name}

    models.create(body=create_spec,
                  parent=project_name).execute()

## predict.py
def validate_model():
    """
    Function to validate the model results
    """
    df_val = pd.read_csv('{}/processed_data/test.csv'.format(job_dir))

    # Submit only 10 samples to the server, ignore the first column (=target column)
    instances = [", ".join(x) for x in df_val.iloc[:10, 1:].astype(str).values.tolist()]
    service = discovery.build('ml', 'v1')
    version_name = 'projects/{}/models/{}'.format(project_id, model_name)

## deepracer_move.py
    def get_headers(self):
        headers = {
            'authority': self.host,
            'x-requested-with': 'XMLHttpRequest',
            'origin': 'https://{}'.format(self.host),
            'x-csrf-token': self.x_csrf_token,
            'user-agent': '...',
            'content-type': 'application/json;charset=UTF-8',
            'accept': '*/*',
            'sec-fetch-site': 'same-origin',
	SELECT
	year,
	month,
	day,
	cumsum,
	ROUND(predicted_label) predicted,
	monthly_cost actual_month_cost,
	ROUND(100 * (ABS(predicted_label - monthly_cost) / monthly_cost),1) abs_err
	FROM
	ML.PREDICT(MODEL `billing_dataset_example.model_linear_regression`,
	CREATE OR REPLACE MODEL
	billing_dataset_example.model_linear_regression --model save path
	OPTIONS
	( model_type='linear_reg', -- As of Aug 2018 you can choose between linear regression and logistic regression
	ls_init_learn_rate=.015,
	l1_reg=0.1,
	l2_reg=0.1,
	data_split_method='seq',
	data_split_col='split_col',
	WITH
	cumsum_table AS(
	SELECT
	account_name, year, month, day,
	ROUND(SUM(daily_cost) OVER (PARTITION BY account_name, year, month ORDER BY day )) AS cumsum,
	ROUND(AVG(daily_cost) OVER (PARTITION BY account_name, year, month ORDER BY day )) AS mean_daily_cost
	FROM
	`billing_dataset_example.billing_daily_monthly`
	ORDER BY account_name, year, month, day),
	monthly_cost_table AS (
	-- https://stackoverflow.com/questions/41707583/get-total-no-of-days-in-given-month-in-google-bigquery
	CREATE TEMP FUNCTION DaysInMonth(d TIMESTAMP) AS (
	32 - EXTRACT(DAY FROM DATE_ADD(DATE_TRUNC(CAST (d AS DATE), MONTH), INTERVAL 31 DAY))
	);

	WITH monthly_cost_table AS (SELECT
	account_name,
	EXTRACT (MONTH FROM bill_datetime) month,
	EXTRACT (YEAR FROM bill_datetime) year,
	SUM(cost) monthly_cost
	WITH dataset AS( SELECT

	EXTRACT(HOUR FROM trip_start_timestamp) trip_start_hour
	, EXTRACT(DAYOFWEEK FROM trip_start_timestamp) trip_start_weekday
	, EXTRACT(WEEK FROM trip_start_timestamp) trip_start_week
	, EXTRACT(DAYOFYEAR FROM trip_start_timestamp) trip_start_yearday
	, EXTRACT(MONTH FROM trip_start_timestamp) trip_start_month
	, (trip_miles * 1.60934 ) / ((trip_seconds + .01) / (60 * 60)) trip_speed_kmph
	, trip_miles
	, pickup_latitude
	{
	"scaleTier": "CUSTOM",
	"masterType": "standard_gpu",
	"args": [
	"--preprocess",
	"--validation_split=0.2",
	"--model_type=regression",
	"--hidden_units=120,60,60",
	"--batch_size=128",
	"--eval_frequency_secs=128",

	def train_hyper_params(cloudml_client, training_inputs):

	job_name = 'chicago_travel_time_training_{}'.format(datetime.utcnow().strftime('%Y%m%d%H%M%S'))
	project_name = 'projects/{}'.format(project_id)
	job_spec = {'jobId': job_name, 'trainingInput': training_inputs}
	response = cloudml_client.projects().jobs().create(body=job_spec,
	parent=project_name).execute()
	print(response)
	def create_model(cloudml_client):
	"""
	Creates a Model entity in AI Platform
	:param cloudml_client: discovery client
	"""
	models = cloudml_client.projects().models()
	create_spec = {'name': model_name}

	models.create(body=create_spec,
	parent=project_name).execute()
	def validate_model():
	"""
	Function to validate the model results
	"""
	df_val = pd.read_csv('{}/processed_data/test.csv'.format(job_dir))

	# Submit only 10 samples to the server, ignore the first column (=target column)
	instances = [", ".join(x) for x in df_val.iloc[:10, 1:].astype(str).values.tolist()]
	service = discovery.build('ml', 'v1')
	version_name = 'projects/{}/models/{}'.format(project_id, model_name)
	def get_headers(self):
	headers = {
	'authority': self.host,
	'x-requested-with': 'XMLHttpRequest',
	'origin': 'https://{}'.format(self.host),
	'x-csrf-token': self.x_csrf_token,
	'user-agent': '...',
	'content-type': 'application/json;charset=UTF-8',
	'accept': '/',
	'sec-fetch-site': 'same-origin',