# kenthua/mpg.py

## mpg.py
import ray
import requests

# Connect to the Ray cluster through the ray:// endpoint on ray-head-svc:10001.
# NOTE: no runtime_env is passed here -- supplying one did not seem to have any
# effect, as the tf module was still reported missing.
ray.init(address="ray://ray-head-svc:10001")

import numpy as np
import pandas as pd
import pathlib
import tensorflow as tf
import os

from tensorflow import keras
from tensorflow.keras import layers

# Report the TensorFlow version, the visible GPUs and the hostname of the
# process that runs this script.
print(tf.__version__)
print(f"Core GPU?: {tf.config.list_physical_devices('GPU')}")
print(f"Core Hostname: {os.uname().nodename}")

# Bucket the trained model is saved under; MPG reads it back from the environment.
os.environ["BUCKET"] = "gs://kh"


def norm(x, train_stats):
    """Standardize ``x`` with the ``'mean'`` and ``'std'`` entries of ``train_stats``.

    Returns ``(x - mean) / std``.
    """
    centered = x - train_stats['mean']
    return centered / train_stats['std']

def build_model(train_dataset):
    """Build and compile the regression network for ``train_dataset``.

    The network is a ``Sequential`` stack of two 64-unit ReLU layers followed
    by a single-unit output layer. The input width is the number of keys
    (columns) of ``train_dataset``. It is compiled with an MSE loss, an
    RMSprop optimizer (learning rate 0.001) and MAE/MSE metrics.
    """
    n_features = len(train_dataset.keys())

    # Layers are created in stacking order: first hidden, second hidden, output.
    first_hidden = layers.Dense(64, activation='relu', input_shape=[n_features])
    second_hidden = layers.Dense(64, activation='relu')
    output = layers.Dense(1)
    net = keras.Sequential([first_hidden, second_hidden, output])

    net.compile(
        loss='mse',
        optimizer=tf.keras.optimizers.RMSprop(0.001),
        metrics=['mae', 'mse'],
    )
    return net

@ray.remote
class MPG:
    """Ray actor that trains the Auto MPG regression model and exports it.

    Creating the actor (``MPG.remote()``) runs the whole pipeline in
    ``__init__``: fetch the Auto MPG data, clean and one-hot encode it,
    normalize the features, fit the network from ``build_model`` with early
    stopping, and save it to ``BUCKET + '/mpg/model'``.

    The pipeline lives in ``__init__`` instead of the class body because a
    class body runs once, where the class is defined (the driver script),
    whereas ``__init__`` runs when the actor itself is constructed.
    """

    # Looked up while the class is defined, in the same process that sets
    # os.environ["BUCKET"], and kept as a class attribute so __init__ does not
    # depend on the actor process having that environment variable.
    BUCKET = os.environ["BUCKET"]

    # Auto MPG data from the UCI Machine Learning Repository
    # (https://archive.ics.uci.edu/ml/).
    DATASET_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
    COLUMN_NAMES = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                    'Acceleration', 'Model Year', 'Origin']

    # Upper bound on training epochs; EarlyStopping may end the run sooner.
    EPOCHS = 4000

    def __init__(self):
        """Run the full train-and-export pipeline inside the actor."""
        print("MPG Class Core GPU?: " + str(tf.config.list_physical_devices('GPU')))
        print("MPG Class Core Hostname: " + os.uname().nodename)

        dataset = self._load_dataset()

        # Split into a training set and a test set kept for final evaluation.
        train_dataset = dataset.sample(frac=0.8, random_state=0)
        test_dataset = dataset.drop(train_dataset.index)

        # Statistics come from the training split only (label excluded) and
        # are reused for the test split, so both are projected into the
        # distribution the model is trained on.
        train_stats = train_dataset.describe()
        train_stats.pop("MPG")
        self.train_stats = train_stats.transpose()

        # Separate the target value ("label") from the features.
        train_labels = train_dataset.pop('MPG')
        self.test_labels = test_dataset.pop('MPG')

        normed_train_data = norm(train_dataset, self.train_stats)
        self.normed_test_data = norm(test_dataset, self.train_stats)

        self.model = build_model(train_dataset)
        self.model.summary()

        # The patience parameter is the amount of epochs to check for improvement
        early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
        early_history = self.model.fit(normed_train_data, train_labels,
                                       epochs=self.EPOCHS, validation_split=0.2,
                                       callbacks=[early_stop])
        self.history = early_history.history

        print("MPG Class Core GPU?: " + str(tf.config.list_physical_devices('GPU')))
        print("MPG Class Core Hostname: " + os.uname().nodename)

        # Export model and save to GCS
        self.export_dir = self.BUCKET + '/mpg/model'
        self.model.save(self.export_dir)

    def _load_dataset(self):
        """Download the raw data and return it cleaned and one-hot encoded."""
        dataset_path = keras.utils.get_file("auto-mpg.data", self.DATASET_URL)
        dataset = pd.read_csv(dataset_path, names=self.COLUMN_NAMES,
                              na_values="?", comment='\t',
                              sep=" ", skipinitialspace=True)

        # The dataset contains a few unknown values; to keep things simple,
        # drop those rows.
        dataset = dataset.dropna()

        # The "Origin" column is really categorical, not numeric, so convert
        # it to one-hot columns.
        dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})

        # NOTE: dtype=float keeps the indicator columns numeric. Newer pandas
        # defaults to bool dummies, which describe() would leave out of the
        # statistics and norm() would then turn into NaN.
        return pd.get_dummies(dataset, prefix='', prefix_sep='', dtype=float)

    def export_path(self):
        """Return the location the trained model was saved to."""
        return self.export_dir


mpg = MPG.remote()
# Actor creation is asynchronous: this call is queued behind __init__, so
# waiting on it blocks until training has finished (and surfaces any failure
# from it) instead of letting the script end right away.
ray.get(mpg.export_path.remote())
	import ray
	import requests

	# Connect to the Ray cluster through the ray:// endpoint on ray-head-svc:10001.
	# NOTE: no runtime_env is passed here -- supplying one did not seem to have any
	# effect, as the tf module was still reported missing.
	ray.init(address="ray://ray-head-svc:10001")

	import numpy as np
	import pandas as pd
	import pathlib
	import tensorflow as tf
	import os

	from tensorflow import keras
	from tensorflow.keras import layers

	# Report the TensorFlow version, the visible GPUs and the hostname of the
	# process that runs this script.
	print(tf.__version__)
	print(f"Core GPU?: {tf.config.list_physical_devices('GPU')}")
	print(f"Core Hostname: {os.uname().nodename}")

	# Bucket the trained model is saved under; MPG reads it back from the environment.
	os.environ["BUCKET"] = "gs://kh"


	def norm(x, train_stats):
	    """Standardize ``x`` with the ``'mean'`` and ``'std'`` entries of ``train_stats``.

	    Returns ``(x - mean) / std``.
	    """
	    return (x - train_stats['mean']) / train_stats['std']

	def build_model(train_dataset):
	    """Build and compile the regression network for ``train_dataset``.

	    The network is a ``Sequential`` stack of two 64-unit ReLU layers
	    followed by a single-unit output layer. The input width is the number
	    of keys (columns) of ``train_dataset``. It is compiled with an MSE
	    loss, an RMSprop optimizer (learning rate 0.001) and MAE/MSE metrics.
	    """
	    model = keras.Sequential([
	        layers.Dense(64, activation='relu', input_shape=[len(train_dataset.keys())]),
	        layers.Dense(64, activation='relu'),
	        layers.Dense(1)
	    ])

	    optimizer = tf.keras.optimizers.RMSprop(0.001)

	    model.compile(loss='mse',
	                  optimizer=optimizer,
	                  metrics=['mae', 'mse'])
	    return model

	@ray.remote
	class MPG:
	    """Ray actor that trains the Auto MPG regression model and exports it.

	    Creating the actor (``MPG.remote()``) runs the whole pipeline in
	    ``__init__``: fetch the Auto MPG data, clean and one-hot encode it,
	    normalize the features, fit the network from ``build_model`` with early
	    stopping, and save it to ``BUCKET + '/mpg/model'``.

	    The pipeline lives in ``__init__`` instead of the class body because a
	    class body runs once, where the class is defined (the driver script),
	    whereas ``__init__`` runs when the actor itself is constructed.
	    """

	    # Looked up while the class is defined, in the same process that sets
	    # os.environ["BUCKET"], and kept as a class attribute so __init__ does not
	    # depend on the actor process having that environment variable.
	    BUCKET = os.environ["BUCKET"]

	    # Auto MPG data from the UCI Machine Learning Repository
	    # (https://archive.ics.uci.edu/ml/).
	    DATASET_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
	    COLUMN_NAMES = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
	                    'Acceleration', 'Model Year', 'Origin']

	    # Upper bound on training epochs; EarlyStopping may end the run sooner.
	    EPOCHS = 4000

	    def __init__(self):
	        """Run the full train-and-export pipeline inside the actor."""
	        print("MPG Class Core GPU?: " + str(tf.config.list_physical_devices('GPU')))
	        print("MPG Class Core Hostname: " + os.uname().nodename)

	        dataset = self._load_dataset()

	        # Split into a training set and a test set kept for final evaluation.
	        train_dataset = dataset.sample(frac=0.8, random_state=0)
	        test_dataset = dataset.drop(train_dataset.index)

	        # Statistics come from the training split only (label excluded) and
	        # are reused for the test split, so both are projected into the
	        # distribution the model is trained on.
	        train_stats = train_dataset.describe()
	        train_stats.pop("MPG")
	        self.train_stats = train_stats.transpose()

	        # Separate the target value ("label") from the features.
	        train_labels = train_dataset.pop('MPG')
	        self.test_labels = test_dataset.pop('MPG')

	        normed_train_data = norm(train_dataset, self.train_stats)
	        self.normed_test_data = norm(test_dataset, self.train_stats)

	        self.model = build_model(train_dataset)
	        self.model.summary()

	        # The patience parameter is the amount of epochs to check for improvement
	        early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
	        early_history = self.model.fit(normed_train_data, train_labels,
	                                       epochs=self.EPOCHS, validation_split=0.2,
	                                       callbacks=[early_stop])
	        self.history = early_history.history

	        print("MPG Class Core GPU?: " + str(tf.config.list_physical_devices('GPU')))
	        print("MPG Class Core Hostname: " + os.uname().nodename)

	        # Export model and save to GCS
	        self.export_dir = self.BUCKET + '/mpg/model'
	        self.model.save(self.export_dir)

	    def _load_dataset(self):
	        """Download the raw data and return it cleaned and one-hot encoded."""
	        dataset_path = keras.utils.get_file("auto-mpg.data", self.DATASET_URL)
	        dataset = pd.read_csv(dataset_path, names=self.COLUMN_NAMES,
	                              na_values="?", comment='\t',
	                              sep=" ", skipinitialspace=True)

	        # The dataset contains a few unknown values; to keep things simple,
	        # drop those rows.
	        dataset = dataset.dropna()

	        # The "Origin" column is really categorical, not numeric, so convert
	        # it to one-hot columns.
	        dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})

	        # NOTE: dtype=float keeps the indicator columns numeric. Newer pandas
	        # defaults to bool dummies, which describe() would leave out of the
	        # statistics and norm() would then turn into NaN.
	        return pd.get_dummies(dataset, prefix='', prefix_sep='', dtype=float)

	    def export_path(self):
	        """Return the location the trained model was saved to."""
	        return self.export_dir


	mpg = MPG.remote()
	# Actor creation is asynchronous: this call is queued behind __init__, so
	# waiting on it blocks until training has finished (and surfaces any failure
	# from it) instead of letting the script end right away.
	ray.get(mpg.export_path.remote())