@jainxy
Created March 3, 2021 04:39
Keras and related code samples
"""
Training
Validation on a holdout set generated from the original training data
Evaluation on the test data
- correct and test batch generation
- Normalize input by 255?
- add batchnorm layers? use model(x, training=False) then
- tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size) ? dataset = dataset.cache()?
- get_compiled_model()
- test last batch having non-dividing batch-size aka residual batch issue
- model.evaluate(test_dataset) -> setup command
- https://keras.io/api/models/model_training_apis/#evaluate-method
- Try Y-channel only
- tf.data.dataset.prefetch(buffer_size)
- tf.one_hot(y,num_classes) to get tensor form
- NN model for tabular data
- Checkpoint
"""
# ======================================================DATA
# Preprocess the data (these are NumPy arrays)
x_train = x_train.reshape(60000, 784).astype("float32") / 255
x_test = x_test.reshape(10000, 784).astype("float32") / 255
y_train = y_train.astype("float32")
y_test = y_test.astype("float32")
# Reserve 10,000 samples for validation
x_val = x_train[-10000:]
y_val = y_train[-10000:]
x_train = x_train[:-10000]
y_train = y_train[:-10000]
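# A minimal sketch (not from the original gist) wiring together the tf.data pieces listed
# in the notes above: from_tensor_slices, one_hot, cache, batch, prefetch. The batch size
# and class count are illustrative assumptions; x_train/y_train are the arrays prepared above.
import tensorflow as tf

batch_size = 64   # illustrative
num_classes = 10  # MNIST-style label count

train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .map(lambda x, y: (x, tf.one_hot(tf.cast(y, tf.int32), num_classes)))  # one-hot labels
    .cache()                        # cache after the cheap map step
    .shuffle(buffer_size=1024)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)     # tf.data.experimental.AUTOTUNE on older TF
)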
# ======================================================DATASET
ds_counter = tf.data.Dataset.from_generator(count, args=[25], output_types=tf.int32, output_shapes = (), )
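# The `count` used above with args=[25] is assumed to be the simple counting generator from
# the tf.data from_generator guide, not the batched count(start, end, batch_size) defined in
# the BATCHING section below. A sketch of that simple version:
def count(stop):
    i = 0
    while i < stop:
        yield i
        i += 1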
# ======================================================COMPILE
model.compile(
    optimizer=keras.optimizers.RMSprop(learning_rate=1e-3),
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=[keras.metrics.SparseCategoricalAccuracy()],
)
model.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)
## OPTIMIZERS: SGD() (w/ or w/o momentum) - RMSprop() - Adam()
## LOSS: SparseCategoricalCrossentropy() - CategoricalCrossentropy()
## METRICS: AUC() - Precision() - Recall()
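# A hedged sketch of the metric classes listed above in a hypothetical binary-classification
# compile (the sigmoid-output model itself is assumed, not part of this gist):
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=[keras.metrics.AUC(), keras.metrics.Precision(), keras.metrics.Recall()],
)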
# ======================================================LR Decay
initial_learning_rate = 0.1
lr_schedule = keras.optimizers.schedules.ExponentialDecay(initial_learning_rate, decay_steps=100000, decay_rate=0.96, staircase=True)
optimizer = keras.optimizers.RMSprop(learning_rate=lr_schedule)
## Static LR Decays: ExponentialDecay, PiecewiseConstantDecay, PolynomialDecay, and InverseTimeDecay
## Dynamic LR Decays:
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.2, patience=5, min_lr=0.0
)
model.fit(X_train, Y_train, callbacks=[reduce_lr])
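# A sketch of another static schedule named above, PiecewiseConstantDecay; the boundaries
# and values are illustrative assumptions:
boundaries = [100000, 110000]     # step boundaries
values = [1.0, 0.5, 0.1]          # LR before, between, and after the boundaries
lr_schedule = keras.optimizers.schedules.PiecewiseConstantDecay(boundaries, values)
optimizer = keras.optimizers.SGD(learning_rate=lr_schedule)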
# ======================================================CALLBACKS
# -- General --
# global callback syntax
on_(train|test|predict)_(begin|end)(self, logs=None)
# batch-level
on_(train|test|predict)_batch_(begin|end)(self, batch, logs=None) # For batch_end, logs is a dict containing metrics results
# epoch-level
on_epoch_(begin|end)(self, epoch, logs=None)
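# A minimal custom-callback sketch using the hooks listed above; the class name and print
# format are illustrative assumptions:
class LoggingCallback(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        # logs is a dict of metric results at the end of the epoch
        print(f"epoch {epoch}: {logs}")

    def on_train_batch_end(self, batch, logs=None):
        if batch % 100 == 0:
            print(f"batch {batch}: {logs}")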
# -- Early stopping --
tf.keras.callbacks.EarlyStopping(patience=1)
# -- Checkpoint --
# Prepare a directory to store all the checkpoints.
checkpoint_dir = "./ckpt"
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

def make_or_restore_model():
    # Either restore the latest model, or create a fresh one
    # if there is no checkpoint available.
    checkpoints = [checkpoint_dir + "/" + name for name in os.listdir(checkpoint_dir)]
    if checkpoints:
        latest_checkpoint = max(checkpoints, key=os.path.getctime)
        print("Restoring from", latest_checkpoint)
        return keras.models.load_model(latest_checkpoint)
    print("Creating a new model")
    return get_compiled_model()

model = make_or_restore_model()
callbacks = [
    # This callback saves a SavedModel every epoch.
    # We include the current epoch in the folder name.
    keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_dir + "/model_3dcnn_<HP-values>-{epoch}",
        save_freq="epoch",  # or an integer number of batches, e.g. 100
    )
]
callbacks = [
    keras.callbacks.ModelCheckpoint(
        # Path where to save the model. The two parameters below mean that we will
        # overwrite the current checkpoint if and only if the `val_loss` score has improved.
        # The saved model name will include the current epoch.
        filepath="mymodel_{epoch}",
        save_best_only=True,  # Only save a model if `val_loss` has improved.
        monitor="val_loss",
        verbose=1,
    )
]
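# A sketch of passing the checkpoint callbacks to training; validation_data is needed so the
# `val_loss`-monitored checkpoint has something to monitor (epoch count is illustrative):
model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(x_val, y_val),
    callbacks=callbacks,
)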
# -- Lambda: print logs after each batch --
from keras.callbacks import LambdaCallback
callbacks = [LambdaCallback(on_batch_end=lambda batch, logs: print(logs))]
# ======================================================FIT/TRAIN
print("Fit model on training data")
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=2,
    # We pass some validation data for
    # monitoring validation loss and metrics
    # at the end of each epoch
    validation_data=(x_val, y_val),
)
history.history
# ======================================================EVALUATE/PREDICT
# Evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results = model.evaluate(x_test, y_test, batch_size=128)
print("test loss, test acc:", results)
dict(zip(model.metrics_names, results))
# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
print("Generate predictions for 3 samples")
predictions = model.predict(x_test[:3])
print("predictions shape:", predictions.shape)
# ====================================================== Function/Class model
## Create a function for model definition and compilation, for repeated calls. CAN PARAMETRIZE to customize things during search.
## DO IT FOR THE DATA GENERATOR AS WELL
def get_uncompiled_model():
    inputs = keras.Input(shape=(784,), name="digits")
    x = layers.Dense(64, activation="relu", name="dense_1")(inputs)
    x = layers.Dense(64, activation="relu", name="dense_2")(x)
    outputs = layers.Dense(10, activation="softmax", name="predictions")(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

def get_compiled_model():
    model = get_uncompiled_model()
    model.compile(
        optimizer="rmsprop",
        loss="sparse_categorical_crossentropy",
        metrics=["sparse_categorical_accuracy"],
    )
    return model
# ======================================================BATCHING
"""
if using 'steps_per_epoch' -> create an infinitely-looping Dataset
"""
# Dataset.padded_batch(batch_size, padded_shapes=None, padding_values=None, drop_remainder=False)
dataset = dataset.batch(5).shuffle(3, reshuffle_each_iteration=True).repeat(4)
ds_series = tf.data.Dataset.from_generator(
    gen_series,
    output_types=(tf.int32, tf.float32),
    output_shapes=((), (None,)),
)
ds_counter = tf.data.Dataset.from_generator(count, args=[25], output_types=tf.int32, output_shapes = (), )
def count(start, end, batch_size):
    sample_count = end - start
    n_batches = int(sample_count // batch_size)
    remainder_samples = sample_count % batch_size
    if remainder_samples > 0:
        n_batches = n_batches + 1
    for idx in range(0, n_batches):
        if idx == n_batches - 1:
            # Pad the residual batch up to batch_size with randomly repeated indices
            pad = random.choices(range(start + idx * batch_size, end), k=(start + batch_size * (idx + 1) - end))
            batch = list(range(start + idx * batch_size, end)) + pad
        else:
            batch = list(range(start + idx * batch_size, start + idx * batch_size + batch_size))
        yield batch

ds_counter = tf.data.Dataset.from_generator(count, args=[6, 100, 5], output_types=tf.int32, output_shapes=(5,))
for count_batch in ds_counter.repeat().batch(10, drop_remainder=False).take(10):
print(count_batch.numpy())
# Assumes the simple single-argument count(stop) generator (see the DATASET section), not the batched count above.
ds_counter = tf.data.Dataset.from_generator(count, args=[25], output_types=tf.int32, output_shapes=())
ds_counter = ds_counter.padded_batch(5, padded_shapes=None, drop_remainder=True).shuffle(100)
for count_batch in ds_counter:
print(count_batch.numpy())
dataset2 = dataset.padded_batch(
    2,
    padded_shapes=([4], [None]),
    padding_values=(-1, 100),
)
## ========================================================EVALUATE/ANALYSE
# Generate generalization metrics
score = model.evaluate(X_test, targets_test, verbose=0)
print(f'Test loss: {score[0]} / Test accuracy: {score[1]}')
# Plot history: Categorical crossentropy & Accuracy
plt.plot(history.history['loss'], label='Categorical crossentropy (training data)')
plt.plot(history.history['val_loss'], label='Categorical crossentropy (validation data)')
plt.plot(history.history['accuracy'], label='Accuracy (training data)')
plt.plot(history.history['val_accuracy'], label='Accuracy (validation data)')
plt.title('Model performance for 3D MNIST Keras Conv3D example')
plt.ylabel('Loss value')
plt.xlabel('No. epoch')
plt.legend(loc="upper left")
plt.show()
##
fig, ax = plt.subplots(1, 2, figsize=(20, 3))
ax = ax.ravel()
for i, metric in enumerate(["acc", "loss"]):
    ax[i].plot(model.history.history[metric])
    ax[i].plot(model.history.history["val_" + metric])
    ax[i].set_title("Model {}".format(metric))
    ax[i].set_xlabel("epochs")
    ax[i].set_ylabel(metric)
    ax[i].legend(["train", "val"])
#======================== Tensorboard
bucket = sagemaker_session.default_bucket()
prefix = 'tensorboard_keras_cifar10'
tensorflow_logs_path = "s3://{}/{}/logs".format(bucket, prefix)
print('Bucket: {}'.format(bucket))
print('SageMaker ver: ' + sagemaker.__version__)
print('Tensorflow ver: ' + tf.__version__)
writer = tf.io.TFRecordWriter(filename)  # filename assumed defined elsewhere
aws_region = sagemaker_session.boto_region_name
!AWS_REGION={aws_region} tensorboard --logdir {tensorflow_logs_path}
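# The command above points TensorBoard at the S3 log path, but nothing in this gist writes
# logs there; a sketch of the TensorBoard callback that would (reusing tensorflow_logs_path):
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=tensorflow_logs_path,  # the s3://... path built above
    histogram_freq=1,              # illustrative: log weight histograms every epoch
)
model.fit(x_train, y_train, epochs=2, validation_data=(x_val, y_val),
          callbacks=[tensorboard_callback])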
#========================
# Print the number of batches; print the epoch number every 10th epoch; add a save-model check
# Print the lr; save stdout => checkpoints and logs
"""
"""
# Let's check:
np.testing.assert_allclose(model.predict(test_input), reconstructed_model.predict(test_input))
## DO analysis on source files -> feature generation etc.