@arivero
Last active March 2, 2024 20:59
Grokking example, on the training dataset
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

def create_synthetic_data(input_dim=1024, num_samples=10000):
    # One-hot labels; the input is the label itself, so the task is to learn the identity map.
    num_classes = input_dim
    y = np.random.randint(0, num_classes, size=(num_samples,))
    y = tf.keras.utils.to_categorical(y, num_classes)
    X = y
    return X, y

input_dim = 1024
num_samples = 10000
X, y = create_synthetic_data(input_dim=input_dim, num_samples=num_samples)

# Custom callback for detailed per-step logging
class DetailedLoggingCallback(tf.keras.callbacks.Callback):
    def on_train_begin(self, logs=None):
        self.step_accuracy = []
        self.learning_rate = []

    def on_train_batch_begin(self, batch, logs=None):
        # Reset the metric at the start of each batch
        self.model.reset_metrics()

    def on_train_batch_end(self, batch, logs=None):
        # Log the metric at the end of each batch
        self.step_accuracy.append(logs.get('accuracy'))
        # Note: this records the optimizer iteration count, not the learning rate itself.
        self.learning_rate.append(self.model.optimizer.iterations.numpy())

activators = ['selu', 'linear', 'elu', 'tanh', 'leaky_relu', 'softsign', 'relu6', 'relu', 'gelu', 'swish', 'softplus', 'sigmoid', 'hard_sigmoid', 'exponential']
activators = ['selu', 'tanh', 'linear', 'relu', 'gelu']  # reduced set actually used in the runs below
dtypes = [tf.float32, tf.float16, tf.float64]

def create_model(input_dim, num_layers, activation='relu', dtype=tf.float32):
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(input_dim,), dtype=dtype))
    for _ in range(num_layers):
        model.add(tf.keras.layers.Dense(128, activation=activation, dtype=dtype))
    model.add(tf.keras.layers.Dense(input_dim, activation='softmax', dtype=dtype))  # sigmoid would be for the multilabel case
    return model

#plt.rcParams.update({'axes.facecolor': 'white', 'figure.facecolor': 'white'})
input_dim = 1024
nlayers = 12
for tipo in dtypes:
    for opt in ['RMSprop', 'Adam', 'SGD', 'Adagrad', 'Adadelta', 'Adamax', 'Nadam', 'Ftrl']:
        plt.figure(figsize=(12, 8))
        order = {}
        line = {}
        for act in activators:
            model = create_model(input_dim=input_dim, num_layers=nlayers, activation=act, dtype=tipo)
            model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
            detailed_logging_callback = DetailedLoggingCallback()
            model.fit(X, y, epochs=2*nlayers*nlayers, batch_size=256, verbose=0, callbacks=[detailed_logging_callback])
            smoothed_accuracy = detailed_logging_callback.step_accuracy
            lr = detailed_logging_callback.learning_rate
            plt.scatter(range(len(smoothed_accuracy)), smoothed_accuracy, label=f'{act}', s=1)
        plt.xlabel('Training Step')
        plt.ylabel('Accuracy')
        plt.title(f'Per-Step {tipo.name} {opt} Training Accuracy for {nlayers} Layers and Different Activators')
        plt.legend()
        plt.show()
arivero commented Mar 2, 2024

At least in Keras, gelu and relu show some tendency to exhibit delayed learning, and the peculiar thing is that the delay is exaggerated by the optimiser. Compare the training of a 12-layer fully connected gelu network under the default RMSprop and the default Adam settings; a reproduction sketch follows the images.

[images: per-step training accuracy of the 12-layer gelu network, RMSprop vs Adam]
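A comparison along these lines can be reproduced with the functions defined in the listing above; the snippet below is a minimal sketch under that assumption (same 1024-dimensional one-hot data, 12 layers, 2*12*12 epochs as in the listing), not the exact original run.

# Minimal sketch (not the original run): the same 12-layer gelu network trained
# with the default RMSprop and the default Adam, per-step accuracy overlaid.
# Assumes create_synthetic_data, create_model and DetailedLoggingCallback from the listing above.
X, y = create_synthetic_data(input_dim=1024, num_samples=10000)
plt.figure(figsize=(12, 8))
for opt in ['RMSprop', 'Adam']:
    model = create_model(input_dim=1024, num_layers=12, activation='gelu')
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    cb = DetailedLoggingCallback()
    model.fit(X, y, epochs=2 * 12 * 12, batch_size=256, verbose=0, callbacks=[cb])
    plt.scatter(range(len(cb.step_accuracy)), cb.step_accuracy, label=opt, s=1)
plt.xlabel('Training Step')
plt.ylabel('Accuracy')
plt.title('12-layer gelu: RMSprop vs Adam, per-step training accuracy')
plt.legend()
plt.show()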

arivero commented Mar 2, 2024

The delay disappears, or at least decreases, if we do not use a bias in the embedding layer; a variant without that bias is sketched below.
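One way to try this with the model above is to drop the bias from the first Dense layer, which plays the role of the embedding of the one-hot input. The sketch below follows that reading rather than the author's exact code; create_model_nobias is a hypothetical helper, and the only change is use_bias=False on the first layer.

# Sketch: same architecture, but the first Dense layer (which embeds the one-hot
# input) is built without a bias term; everything else is as in the listing above.
def create_model_nobias(input_dim, num_layers, activation='relu', dtype=tf.float32):
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(input_dim,), dtype=dtype))
    model.add(tf.keras.layers.Dense(128, activation=activation, use_bias=False, dtype=dtype))
    for _ in range(num_layers - 1):
        model.add(tf.keras.layers.Dense(128, activation=activation, dtype=dtype))
    model.add(tf.keras.layers.Dense(input_dim, activation='softmax', dtype=dtype))
    return model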
