@Tony607
Created December 31, 2018 10:18
Bag of Tricks for Image Classification with Convolutional Neural Networks in Keras | DLology
import numpy as np
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import backend as K


def cosine_decay_with_warmup(global_step,
                             learning_rate_base,
                             total_steps,
                             warmup_learning_rate=0.0,
                             warmup_steps=0,
                             hold_base_rate_steps=0):
    """Cosine decay schedule with warm up period.

    Cosine annealing learning rate as described in:
      Loshchilov and Hutter, SGDR: Stochastic Gradient Descent with Warm Restarts.
      ICLR 2017. https://arxiv.org/abs/1608.03983
    In this schedule, the learning rate grows linearly from warmup_learning_rate
    to learning_rate_base for warmup_steps, then transitions to a cosine decay
    schedule.

    Arguments:
        global_step {int} -- global step.
        learning_rate_base {float} -- base learning rate.
        total_steps {int} -- total number of training steps.

    Keyword Arguments:
        warmup_learning_rate {float} -- initial learning rate for warm up. (default: {0.0})
        warmup_steps {int} -- number of warmup steps. (default: {0})
        hold_base_rate_steps {int} -- Optional number of steps to hold base learning rate
                                      before decaying. (default: {0})

    Returns:
        a float representing learning rate.

    Raises:
        ValueError: if warmup_learning_rate is larger than learning_rate_base,
            or if warmup_steps is larger than total_steps.
    """
    if total_steps < warmup_steps:
        raise ValueError('total_steps must be larger or equal to '
                         'warmup_steps.')
    learning_rate = 0.5 * learning_rate_base * (1 + np.cos(
        np.pi *
        (global_step - warmup_steps - hold_base_rate_steps
         ) / float(total_steps - warmup_steps - hold_base_rate_steps)))
    if hold_base_rate_steps > 0:
        learning_rate = np.where(global_step > warmup_steps + hold_base_rate_steps,
                                 learning_rate, learning_rate_base)
    if warmup_steps > 0:
        if learning_rate_base < warmup_learning_rate:
            raise ValueError('learning_rate_base must be larger or equal to '
                             'warmup_learning_rate.')
        slope = (learning_rate_base - warmup_learning_rate) / warmup_steps
        warmup_rate = slope * global_step + warmup_learning_rate
        learning_rate = np.where(global_step < warmup_steps, warmup_rate,
                                 learning_rate)
    return np.where(global_step > total_steps, 0.0, learning_rate)
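

# A quick sanity check of the schedule (a minimal illustration; the 0.001 base
# rate and the 300/30 step counts simply mirror the demo settings further down):
# the learning rate starts at 0, rises linearly to the base rate at step 30,
# passes through half the base rate midway through the decay, and reaches 0 at
# the final step.
for step in (0, 15, 30, 165, 300):
    print(step, cosine_decay_with_warmup(global_step=step,
                                         learning_rate_base=0.001,
                                         total_steps=300,
                                         warmup_steps=30))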


class WarmUpCosineDecayScheduler(keras.callbacks.Callback):
    """Cosine decay with warmup learning rate scheduler."""

    def __init__(self,
                 learning_rate_base,
                 total_steps,
                 global_step_init=0,
                 warmup_learning_rate=0.0,
                 warmup_steps=0,
                 hold_base_rate_steps=0,
                 verbose=0):
        """Constructor for cosine decay with warmup learning rate scheduler.

        Arguments:
            learning_rate_base {float} -- base learning rate.
            total_steps {int} -- total number of training steps.

        Keyword Arguments:
            global_step_init {int} -- initial global step, e.g. from previous checkpoint.
            warmup_learning_rate {float} -- initial learning rate for warm up. (default: {0.0})
            warmup_steps {int} -- number of warmup steps. (default: {0})
            hold_base_rate_steps {int} -- Optional number of steps to hold base learning rate
                                          before decaying. (default: {0})
            verbose {int} -- 0: quiet, 1: update messages. (default: {0})
        """
        super(WarmUpCosineDecayScheduler, self).__init__()
        self.learning_rate_base = learning_rate_base
        self.total_steps = total_steps
        self.global_step = global_step_init
        self.warmup_learning_rate = warmup_learning_rate
        self.warmup_steps = warmup_steps
        self.hold_base_rate_steps = hold_base_rate_steps
        self.verbose = verbose
        self.learning_rates = []

    def on_batch_end(self, batch, logs=None):
        self.global_step = self.global_step + 1
        lr = K.get_value(self.model.optimizer.lr)
        self.learning_rates.append(lr)

    def on_batch_begin(self, batch, logs=None):
        lr = cosine_decay_with_warmup(global_step=self.global_step,
                                      learning_rate_base=self.learning_rate_base,
                                      total_steps=self.total_steps,
                                      warmup_learning_rate=self.warmup_learning_rate,
                                      warmup_steps=self.warmup_steps,
                                      hold_base_rate_steps=self.hold_base_rate_steps)
        K.set_value(self.model.optimizer.lr, lr)
        if self.verbose > 0:
            print('\nBatch %05d: setting learning '
                  'rate to %s.' % (self.global_step + 1, lr))


# Create a model.
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=100))
model.add(Dense(10, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# Number of training samples.
sample_count = 12
# Total epochs to train.
epochs = 100
# Number of warmup epochs.
warmup_epoch = 10
# Training batch size, set small value here for demonstration purpose.
batch_size = 4
# Base learning rate after warmup.
learning_rate_base = 0.001
total_steps = int(epochs * sample_count / batch_size)
# Compute the number of warmup batches.
warmup_steps = int(warmup_epoch * sample_count / batch_size)
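# With the demo settings above (12 samples, batch size 4, 100 epochs, 10 warmup
# epochs) this works out to 12 / 4 = 3 steps per epoch, i.e. 300 total steps,
# of which the first 30 are warmup.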
# Generate dummy data.
data = np.random.random((sample_count, 100))
labels = np.random.randint(10, size=(sample_count, 1))
# Convert labels to categorical one-hot encoding.
one_hot_labels = keras.utils.to_categorical(labels, num_classes=10)
# Create the learning rate scheduler.
warm_up_lr = WarmUpCosineDecayScheduler(learning_rate_base=learning_rate_base,
                                        total_steps=total_steps,
                                        warmup_learning_rate=0.0,
                                        warmup_steps=warmup_steps,
                                        hold_base_rate_steps=0)
# Train the model, iterating on the data in batches of batch_size (here 4) samples.
model.fit(data, one_hot_labels, epochs=epochs, batch_size=batch_size,
          verbose=0, callbacks=[warm_up_lr])
import matplotlib.pyplot as plt
plt.plot(warm_up_lr.learning_rates)
plt.xlabel('Step', fontsize=20)
plt.ylabel('lr', fontsize=20)
plt.axis([0, total_steps, 0, learning_rate_base*1.1])
plt.xticks(np.arange(0, total_steps, 50))
plt.grid()
plt.title('Cosine decay with warmup', fontsize=20)
plt.show()
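
# The schedule function can also be previewed directly, without training, to see
# the effect of hold_base_rate_steps, which keeps the learning rate at the base
# value for a while after warmup before the cosine decay begins. A minimal sketch
# using the demo settings above; the 60-step hold is an arbitrary example value.
held_lrs = [cosine_decay_with_warmup(global_step=step,
                                     learning_rate_base=learning_rate_base,
                                     total_steps=total_steps,
                                     warmup_steps=warmup_steps,
                                     hold_base_rate_steps=60)
            for step in range(total_steps)]
plt.plot(held_lrs)
plt.xlabel('Step', fontsize=20)
plt.ylabel('lr', fontsize=20)
plt.title('Cosine decay with warmup and hold', fontsize=20)
plt.grid()
plt.show()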