Last active
October 26, 2021 19:41
-
-
Save SLAPaper/762ebc26a421d6465a92d6811feffde9 to your computer and use it in GitHub Desktop.
Compare RELU, ELU, SELU, Swish and Scaled Swish in Reuters MLP (based on Keras' example)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''Compares self-normalizing MLPs with regular MLPs. | |
Compares the performance of a simple MLP using two | |
different activation functions: RELU and SELU | |
on the Reuters newswire topic classification task. | |
# Reference: | |
Klambauer, G., Unterthiner, T., Mayr, A., & Hochreiter, S. (2017). | |
Self-Normalizing Neural Networks. arXiv preprint arXiv:1706.02515. | |
https://arxiv.org/abs/1706.02515 | |
Prajit R., Barret Z., Quoc V. L. (2017). | |
Swish: A Self-Gated Activation Function. arXiv preprint arXiv:1710.05941. | |
https://arxiv.org/abs/1710.05941 | |
''' | |
from __future__ import print_function | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import keras | |
from keras.datasets import reuters | |
from keras.models import Sequential | |
from keras.layers import Dense, Activation, Dropout | |
from keras.layers.noise import AlphaDropout | |
from keras.preprocessing.text import Tokenizer | |
from keras import backend as K | |
from numpy import random | |
from keras.regularizers import l2 | |
# Vocabulary cap: keep only the 1000 most frequent words.
max_words = 1000
batch_size = 32
epochs = 40
plot = True
# Seed NumPy's RNG so runs are reproducible.
random.seed(777)
def create_network(n_dense=6,
                   dense_units=16,
                   activation='selu',
                   dropout=AlphaDropout,
                   dropout_rate=0.1,
                   kernel_initializer='lecun_normal',
                   optimizer='adam',
                   num_classes=1,
                   max_words=max_words,
                   regularizer=l2()):
    """Generic function to create a fully-connected neural network.

    # Arguments
        n_dense: int > 0. Number of dense layers.
        dense_units: int > 0. Number of dense units per layer.
        activation: str or callable. Activation applied after every
            hidden Dense layer.
        dropout: keras.layers.Layer. A dropout layer to apply.
        dropout_rate: 0 <= float <= 1. The rate of dropout.
        kernel_initializer: str. The initializer for the weights.
        optimizer: str/keras.optimizers.Optimizer. The optimizer to use.
        num_classes: int > 0. The number of classes to predict.
        max_words: int > 0. The maximum number of words per data point.
        regularizer: keras.regularizers.Regularizer. Kernel regularizer
            applied to every hidden Dense layer. NOTE: the default
            `l2()` instance is created once at definition time and
            shared across calls — fine for a stateless config object.

    # Returns
        A Keras model instance (compiled).
    """
    model = Sequential()
    # First hidden layer must declare the input shape; later layers infer it.
    model.add(
        Dense(
            dense_units,
            input_shape=(max_words,),
            kernel_initializer=kernel_initializer,
            kernel_regularizer=regularizer))
    model.add(Activation(activation))
    model.add(dropout(dropout_rate))
    # Remaining n_dense - 1 hidden layers share the same configuration.
    for _ in range(n_dense - 1):
        model.add(
            Dense(
                dense_units,
                kernel_initializer=kernel_initializer,
                kernel_regularizer=regularizer))
        model.add(Activation(activation))
        model.add(dropout(dropout_rate))
    # Softmax output over the topic classes.
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'])
    return model
# Baseline: ReLU with plain Dropout and Glorot-uniform initialization.
network0 = {
    'n_dense': 6,
    'dense_units': 16,
    'activation': 'relu',
    'dropout': Dropout,
    'dropout_rate': 0.5,
    'kernel_initializer': 'glorot_uniform',
    'optimizer': 'adam'
}
# ELU variant with He-normal initialization.
network1 = {
    'n_dense': 6,
    'dense_units': 16,
    'activation': 'elu',
    'dropout': Dropout,
    'dropout_rate': 0.5,
    'kernel_initializer': 'he_normal',
    'optimizer': 'adam'
}
# Self-normalizing setup: SELU requires AlphaDropout and lecun_normal
# initialization (Klambauer et al., 2017).
network2 = {
    'n_dense': 6,
    'dense_units': 16,
    'activation': 'selu',
    'dropout': AlphaDropout,
    'dropout_rate': 0.1,
    'kernel_initializer': 'lecun_normal',
    'optimizer': 'adam'
}
# Swish activation (x * sigmoid(x), Ramachandran et al., 2017),
# supplied as a callable since Keras has no built-in name for it here.
network3 = {
    'n_dense': 6,
    'dense_units': 16,
    'activation': lambda x: x * K.sigmoid(x),
    'dropout': Dropout,
    'dropout_rate': 0.5,
    'kernel_initializer': 'he_normal',
    'optimizer': 'adam'
}
print('Loading data...')
# Reuters newswire topic dataset, restricted to the top `max_words`
# words; 20% of the samples are held out as the test set.
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    num_words=max_words, test_split=0.2)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# Labels are 0-based class indices, so the class count is max label + 1.
num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Vectorizing sequence data...')
# Convert each word-index sequence into a fixed-length binary
# bag-of-words vector of size `max_words`.
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
# One-hot encode the integer labels.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
# Build, train and evaluate each configuration in turn. 10% of the
# training set is split off for per-epoch validation; test-set scores
# are computed separately afterwards.
print('\nBuilding network 0 (RELU)...')
model0 = create_network(num_classes=num_classes, **network0)
history_model0 = model0.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1)
score_model0 = model0.evaluate(
    x_test, y_test, batch_size=batch_size, verbose=1)

print('\nBuilding network 1 (ELU)...')
model1 = create_network(num_classes=num_classes, **network1)
history_model1 = model1.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1)
score_model1 = model1.evaluate(
    x_test, y_test, batch_size=batch_size, verbose=1)

print('\nBuilding network 2 (SELU)...')
model2 = create_network(num_classes=num_classes, **network2)
history_model2 = model2.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1)
score_model2 = model2.evaluate(
    x_test, y_test, batch_size=batch_size, verbose=1)

print('\nBuilding network 3 (Swish)...')
model3 = create_network(num_classes=num_classes, **network3)
history_model3 = model3.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1)
score_model3 = model3.evaluate(
    x_test, y_test, batch_size=batch_size, verbose=1)
# Report test scores (evaluate returns [loss, accuracy]).
print('\nNetwork 0 (RELU) results')
print('Hyperparameters:', network0)
print('Test score:', score_model0[0])
print('Test accuracy:', score_model0[1])
print('Network 1 (ELU) results')
print('Hyperparameters:', network1)
print('Test score:', score_model1[0])
print('Test accuracy:', score_model1[1])
print('Network 2 (SELU) results')
print('Hyperparameters:', network2)
print('Test score:', score_model2[0])
print('Test accuracy:', score_model2[1])
print('Network 3 (Swish) results')
print('Hyperparameters:', network3)
print('Test score:', score_model3[0])
print('Test accuracy:', score_model3[1])

# Plot validation loss (top subplot) and training loss (bottom subplot)
# per epoch for all four networks, then save the figure to disk.
ax1 = plt.subplot(2, 1, 1)
ax1.plot(
    range(epochs),
    history_model0.history['val_loss'],
    'r-',
    label='Network 0 (RELU) Val Loss')
ax1.plot(
    range(epochs),
    history_model1.history['val_loss'],
    'g-',
    label='Network 1 (ELU) Val Loss')
ax1.plot(
    range(epochs),
    history_model2.history['val_loss'],
    'b-',
    label='Network 2 (SELU) Val Loss')
ax1.plot(
    range(epochs),
    history_model3.history['val_loss'],
    'c-',
    label='Network 3 (Swish) Val Loss')
ax1.set_ylabel('Validation Loss')
ax1.set_ylim(0)
ax1.legend(fontsize='x-small')
ax2 = plt.subplot(2, 1, 2)
ax2.plot(
    range(epochs),
    history_model0.history['loss'],
    'r-',
    label='Network 0 (RELU) Train Loss')
ax2.plot(
    range(epochs),
    history_model1.history['loss'],
    'g-',
    label='Network 1 (ELU) Train Loss')
ax2.plot(
    range(epochs),
    history_model2.history['loss'],
    'b-',
    label='Network 2 (SELU) Train Loss')
ax2.plot(
    range(epochs),
    history_model3.history['loss'],
    'c-',
    label='Network 3 (Swish) Train Loss')
ax2.set_ylabel('Training Loss')
ax2.set_ylim(0)
ax2.legend(fontsize='x-small')
plt.xlabel('Epochs')
plt.savefig('comparison_of_networks.png')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''Compares self-normalizing MLPs with regular MLPs. | |
Compares the performance of a simple MLP using two | |
different activation functions: RELU and SELU | |
on the Reuters newswire topic classification task. | |
# Reference: | |
Klambauer, G., Unterthiner, T., Mayr, A., & Hochreiter, S. (2017). | |
Self-Normalizing Neural Networks. arXiv preprint arXiv:1706.02515. | |
https://arxiv.org/abs/1706.02515 | |
Prajit R., Barret Z., Quoc V. L. (2017). | |
Swish: A Self-Gated Activation Function. arXiv preprint arXiv:1710.05941. | |
https://arxiv.org/abs/1710.05941 | |
''' | |
from __future__ import print_function | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import keras | |
from keras.datasets import reuters | |
from keras.models import Sequential | |
from keras.layers import Dense, Activation, Dropout | |
from keras.layers.noise import AlphaDropout | |
from keras.preprocessing.text import Tokenizer | |
from keras import backend as K | |
from numpy import random | |
# Vocabulary cap: keep only the 1000 most frequent words.
max_words = 1000
batch_size = 16
epochs = 40
plot = True
# Seed NumPy's RNG so runs are reproducible.
random.seed(777)
def create_network(n_dense=6,
                   dense_units=16,
                   activation='selu',
                   dropout=AlphaDropout,
                   dropout_rate=0.1,
                   kernel_initializer='lecun_normal',
                   optimizer='adam',
                   num_classes=1,
                   max_words=max_words):
    """Generic function to create a fully-connected neural network.

    # Arguments
        n_dense: int > 0. Number of dense layers.
        dense_units: int > 0. Number of dense units per layer.
        activation: str or callable. Activation applied after every
            hidden Dense layer.
        dropout: keras.layers.Layer. A dropout layer to apply.
        dropout_rate: 0 <= float <= 1. The rate of dropout.
        kernel_initializer: str. The initializer for the weights.
        optimizer: str/keras.optimizers.Optimizer. The optimizer to use.
        num_classes: int > 0. The number of classes to predict.
        max_words: int > 0. The maximum number of words per data point.

    # Returns
        A Keras model instance (compiled).
    """
    model = Sequential()
    # First hidden layer must declare the input shape; later layers infer it.
    model.add(
        Dense(
            dense_units,
            input_shape=(max_words,),
            kernel_initializer=kernel_initializer))
    model.add(Activation(activation))
    model.add(dropout(dropout_rate))
    # Remaining n_dense - 1 hidden layers share the same configuration.
    for _ in range(n_dense - 1):
        model.add(Dense(dense_units, kernel_initializer=kernel_initializer))
        model.add(Activation(activation))
        model.add(dropout(dropout_rate))
    # Softmax output over the topic classes.
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'])
    return model
# Baseline: ReLU with plain Dropout and Glorot-uniform initialization.
network1 = {
    'n_dense': 6,
    'dense_units': 16,
    'activation': 'relu',
    'dropout': Dropout,
    'dropout_rate': 0.5,
    'kernel_initializer': 'glorot_uniform',
    'optimizer': 'sgd'
}
# Self-normalizing setup: SELU requires AlphaDropout and lecun_normal
# initialization (Klambauer et al., 2017).
network2 = {
    'n_dense': 6,
    'dense_units': 16,
    'activation': 'selu',
    'dropout': AlphaDropout,
    'dropout_rate': 0.1,
    'kernel_initializer': 'lecun_normal',
    'optimizer': 'sgd'
}
# Swish activation (x * sigmoid(x), Ramachandran et al., 2017),
# supplied as a callable since Keras has no built-in name for it here.
network3 = {
    'n_dense': 6,
    'dense_units': 16,
    'activation': lambda x: x * K.sigmoid(x),
    'dropout': Dropout,
    'dropout_rate': 0.5,
    'kernel_initializer': 'he_normal',
    'optimizer': 'rmsprop'
}
# Scaled Swish: Swish multiplied by a fixed constant the author states
# was derived the same way as SELU's scale; paired with the
# self-normalizing AlphaDropout/lecun_normal setup.
network4 = {
    'n_dense': 6,
    'dense_units': 16,
    'activation': lambda x: 1.67653251702 * x * K.sigmoid(x),  # calculated using selu method
    'dropout': AlphaDropout,
    'dropout_rate': 0.1,
    'kernel_initializer': 'lecun_normal',
    'optimizer': 'sgd'
}
print('Loading data...')
# Reuters newswire topic dataset, restricted to the top `max_words`
# words; 20% of the samples are held out as the test set.
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    num_words=max_words, test_split=0.2)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# Labels are 0-based class indices, so the class count is max label + 1.
num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Vectorizing sequence data...')
# Convert each word-index sequence into a fixed-length binary
# bag-of-words vector of size `max_words`.
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
# One-hot encode the integer labels.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
# Build, train and evaluate networks 1-3. 10% of the training set is
# split off for per-epoch validation; test-set scores are computed
# separately afterwards.
print('\nBuilding network 1 (RELU)...')
model1 = create_network(num_classes=num_classes, **network1)
history_model1 = model1.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1)
score_model1 = model1.evaluate(
    x_test, y_test, batch_size=batch_size, verbose=1)

print('\nBuilding network 2 (SELU)...')
model2 = create_network(num_classes=num_classes, **network2)
history_model2 = model2.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1)
score_model2 = model2.evaluate(
    x_test, y_test, batch_size=batch_size, verbose=1)

print('\nBuilding network 3 (Swish by Google Brain)...')
model3 = create_network(num_classes=num_classes, **network3)
history_model3 = model3.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1)
score_model3 = model3.evaluate(
    x_test, y_test, batch_size=batch_size, verbose=1)
# Build, train and evaluate network 4 (Scaled Swish), same protocol as
# the other networks. The progress message previously read
# "(Scaled Swish..." with an unbalanced parenthesis — fixed.
print('\nBuilding network 4 (Scaled Swish)...')
model4 = create_network(num_classes=num_classes, **network4)
history_model4 = model4.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1)
score_model4 = model4.evaluate(
    x_test, y_test, batch_size=batch_size, verbose=1)
# Report test scores (evaluate returns [loss, accuracy]).
print('\nNetwork 1 (RELU) results')
print('Hyperparameters:', network1)
print('Test score:', score_model1[0])
print('Test accuracy:', score_model1[1])
print('Network 2 (SELU) results')
print('Hyperparameters:', network2)
print('Test score:', score_model2[0])
print('Test accuracy:', score_model2[1])
print('Network 3 (Swish by Google Brain) results')
print('Hyperparameters:', network3)
print('Test score:', score_model3[0])
print('Test accuracy:', score_model3[1])
print('Network 4 (Scaled Swish) results')
print('Hyperparameters:', network4)
print('Test score:', score_model4[0])
print('Test accuracy:', score_model4[1])

# Plot per-epoch losses for all four networks on a single axis:
# solid lines = validation loss, dashed lines = training loss.
plt.plot(
    range(epochs),
    history_model1.history['val_loss'],
    'g-',
    label='Network 1 (RELU) Val Loss')
plt.plot(
    range(epochs),
    history_model2.history['val_loss'],
    'r-',
    label='Network 2 (SELU) Val Loss')
plt.plot(
    range(epochs),
    history_model3.history['val_loss'],
    'b-',
    label='Network 3 (Swish) Val Loss')
plt.plot(
    range(epochs),
    history_model4.history['val_loss'],
    'y-',
    label='Network 4 (S-Swish) Val Loss')
plt.plot(
    range(epochs),
    history_model1.history['loss'],
    'g--',
    label='Network 1 (RELU) Loss')
plt.plot(
    range(epochs),
    history_model2.history['loss'],
    'r--',
    label='Network 2 (SELU) Loss')
plt.plot(
    range(epochs),
    history_model3.history['loss'],
    'b--',
    label='Network 3 (Swish) Loss')
plt.plot(
    range(epochs),
    history_model4.history['loss'],
    'y--',
    label='Network 4 (S-Swish) Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig('comparison_of_networks.png')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment