Manual Prático do Deep Learning - Rede Neural
import numpy as np
import _pickle as pkl

# activation functions
def linear(x, derivative=False):
    return np.ones_like(x) if derivative else x

def sigmoid(x, derivative=False):
    if derivative:
        y = sigmoid(x)
        return y*(1 - y)
    return 1.0/(1.0 + np.exp(-x))

def tanh(x, derivative=False):
    if derivative:
        y = tanh(x)
        return 1 - y**2
    return (np.exp(x) - np.exp(-x))/(np.exp(x) + np.exp(-x))

def relu(x, derivative=False):
    if derivative:
        return np.where(x <= 0, 0, 1)
    return np.maximum(0, x)

def leaky_relu(x, derivative=False):
    alpha = 0.1
    if derivative:
        return np.where(x <= 0, alpha, 1)
    return np.where(x <= 0, alpha*x, x)

def elu(x, derivative=False):
    alpha = 1.0
    if derivative:
        y = elu(x)
        return np.where(x <= 0, y + alpha, 1)
    return np.where(x <= 0, alpha*(np.exp(x) - 1), x)
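
# quick sanity check (illustrative only, not used by the network): the analytic
# derivative of sigmoid should agree with a central finite-difference estimate
_z = np.array([[-2.0, -0.5, 0.0, 0.5, 2.0]])
_eps = 1e-6
_fd = (sigmoid(_z + _eps) - sigmoid(_z - _eps)) / (2*_eps)
assert np.allclose(sigmoid(_z, derivative=True), _fd, atol=1e-6)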

# other functions
def softmax(x, y_oh=None, derivative=False):
    if derivative:
        y_pred = softmax(x)
        y_correct = np.argmax(y_oh, axis=1)
        pk = y_pred[range(y_pred.shape[0]), y_correct]
        y_pred[range(y_pred.shape[0]), y_correct] = pk*(1.0 - pk)
        return y_pred
    exp = np.exp(x)
    return exp/np.sum(exp, axis=1, keepdims=True)

def neg_log_likelihood(y_oh, y_pred, derivative=False):
    y_correct = np.argmax(y_oh, axis=1)
    pk = y_pred[range(y_pred.shape[0]), y_correct]
    if derivative:
        y_pred[range(y_pred.shape[0]), y_correct] = (-1.0/pk)
        return y_pred
    return np.mean(-np.log(pk))
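
# illustrative check: each softmax row is a probability distribution (non-negative, sums to 1)
_logits = np.array([[1.0, 2.0, 3.0], [0.1, 0.1, 0.1]])
_probs = softmax(_logits)
assert np.allclose(np.sum(_probs, axis=1), 1.0) and np.all(_probs >= 0)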

# cost functions
def mae(y, y_pred, derivative=False):
    if derivative:
        return np.where(y_pred > y, 1, -1) / y.shape[0]
    return np.mean(np.abs(y - y_pred))

def mse(y, y_pred, derivative=False):
    if derivative:
        return -(y - y_pred) / y.shape[0]
    return 0.5*np.mean((y - y_pred)**2)

def binary_cross_entropy(y, y_pred, derivative=False):
    if derivative:
        return -(y - y_pred) / (y_pred * (1-y_pred) * y.shape[0])
    return -np.mean(y*np.log(y_pred) + (1-y)*np.log(1-y_pred))

def sigmoid_cross_entropy(y, y_pred, derivative=False):
    y_sigmoid = sigmoid(y_pred)
    if derivative:
        return -(y - y_sigmoid) / y.shape[0]
    return -np.mean(y*np.log(y_sigmoid) + (1-y)*np.log(1-y_sigmoid))

def softmax_neg_log_likelihood(y_oh, y_pred, derivative=False):
    y_softmax = softmax(y_pred)
    y_correct = np.argmax(y_oh, axis=1)
    pk = y_softmax[range(y_softmax.shape[0]), y_correct]
    if derivative:
        return -(y_oh - y_softmax)/y_oh.shape[0]
    return np.mean(-np.log(pk))
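
# illustrative check: mse's analytic gradient vs. a finite-difference estimate
# for a single-output regression target (shapes (N, 1))
_y = np.array([[1.0], [0.0], [0.5]])
_p = np.array([[0.8], [0.2], [0.4]])
_eps = 1e-6
_g = mse(_y, _p, derivative=True)
for _i in range(_p.shape[0]):
    _pp, _pm = _p.copy(), _p.copy()
    _pp[_i, 0] += _eps
    _pm[_i, 0] -= _eps
    assert np.isclose(_g[_i, 0], (mse(_y, _pp) - mse(_y, _pm)) / (2*_eps))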

# weights initialization
def zeros(rows, cols):
    return np.zeros((rows, cols))

def ones(rows, cols):
    return np.ones((rows, cols))

def random_normal(rows, cols):
    return np.random.randn(rows, cols)

def random_uniform(rows, cols):
    return np.random.rand(rows, cols)

def glorot_normal(rows, cols):
    # normal with mean=0 and stddev=sqrt(2.0 / (out + inp)). See the notes of np.random.randn.
    std_dev = np.sqrt(2.0 / (rows + cols))
    return std_dev*np.random.randn(rows, cols)

def glorot_uniform(rows, cols):
    # uniform in [-limit, limit], where limit = np.sqrt(6.0 / (out + inp))
    limit = np.sqrt(6.0 / (rows + cols))
    return 2*limit*np.random.rand(rows, cols) - limit
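
# illustrative check: glorot_normal samples should have std close to sqrt(2/(rows+cols))
_w = glorot_normal(500, 300)
assert np.isclose(np.std(_w), np.sqrt(2.0 / (500 + 300)), rtol=0.05)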

# regularization
def l1_regularization(weights, derivative=False):
    if derivative:
        weights = [np.where(w < 0, -1, w) for w in weights]
        return np.array([np.where(w > 0, 1, w) for w in weights])
    return np.sum([np.sum(np.abs(w)) for w in weights])

def l2_regularization(weights, derivative=False):
    if derivative:
        return weights
    return 0.5 * np.sum(weights**2)
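
# illustrative example: the L2 penalty of a small weight matrix and its gradient
# (the gradient of 0.5*sum(w**2) with respect to w is simply w)
_w = np.array([[1.0, -2.0], [0.0, 3.0]])
assert np.isclose(l2_regularization(_w), 0.5 * (1 + 4 + 0 + 9))
assert np.allclose(l2_regularization(_w, derivative=True), _w)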

# batch generator
def batch_sequential(x, y, batch_size=None):
    batch_size = x.shape[0] if batch_size is None else batch_size
    n_batches = x.shape[0] // batch_size
    for batch in range(n_batches):
        offset = batch_size * batch
        x_batch, y_batch = x[offset:offset+batch_size], y[offset:offset+batch_size]
        yield (x_batch, y_batch)

def batch_shuffle(x, y, batch_size=None):
    shuffle_index = np.random.permutation(range(x.shape[0]))
    return batch_sequential(x[shuffle_index], y[shuffle_index], batch_size)
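
# illustrative usage: 10 samples with batch_size=4 yield 2 full batches (the remainder is dropped)
_xb = np.arange(10).reshape(10, 1)
_yb = np.arange(10).reshape(10, 1)
_batches = list(batch_sequential(_xb, _yb, batch_size=4))
assert len(_batches) == 2 and _batches[0][0].shape == (4, 1)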

# learning rate decay
def none_decay(learning_rate, epoch, decay_rate, decay_steps=1):
    return learning_rate

def time_based_decay(learning_rate, epoch, decay_rate, decay_steps=1):
    return learning_rate / (1 + decay_rate * epoch)

def exponential_decay(learning_rate, epoch, decay_rate, decay_steps=1):
    return learning_rate * decay_rate**epoch

def staircase_decay(learning_rate, epoch, decay_rate, decay_steps=1):
    return learning_rate * decay_rate**(epoch // decay_steps)
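
# illustrative example: staircase decay halves the learning rate every `decay_steps` epochs
# (here: learning_rate=0.1, decay_rate=0.5, decay_steps=10)
assert staircase_decay(0.1, 5, 0.5, 10) == 0.1
assert staircase_decay(0.1, 10, 0.5, 10) == 0.1 * 0.5
assert staircase_decay(0.1, 25, 0.5, 10) == 0.1 * 0.25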

# batch normalization
def batchnorm_forward(layer, x, is_training=True):
    mu = np.mean(x, axis=0) if is_training else layer._pop_mean
    var = np.var(x, axis=0) if is_training else layer._pop_var
    x_norm = (x - mu) / np.sqrt(var + 1e-8)
    out = layer.gamma * x_norm + layer.beta
    if is_training:
        layer._pop_mean = layer.bn_decay * layer._pop_mean + (1.0-layer.bn_decay)*mu
        layer._pop_var = layer.bn_decay * layer._pop_var + (1.0-layer.bn_decay)*var
        layer._bn_cache = (x, x_norm, mu, var)
    return out

def batchnorm_backward(layer, dactivation):
    x, x_norm, mu, var = layer._bn_cache
    m = layer._activ_inp.shape[0]
    x_mu = x - mu
    std_inv = 1. / np.sqrt(var + 1e-8)
    dx_norm = dactivation * layer.gamma
    dvar = np.sum(dx_norm * x_mu, axis=0) * -0.5 * (std_inv**3)
    dmu = np.sum(dx_norm * -std_inv, axis=0) + dvar * np.mean(-2.0 * x_mu, axis=0)
    dx = (dx_norm * std_inv) + (dvar * 2.0 * x_mu / m) + (dmu / m)
    layer._dgamma = np.sum(dactivation * x_norm, axis=0)
    layer._dbeta = np.sum(dactivation, axis=0)
    return dx
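
# illustrative check with a stand-in "layer" object (SimpleNamespace is only a stand-in
# for the Layer class defined below): with gamma=1 and beta=0, the normalized batch
# should have per-feature mean ~0 and variance ~1
from types import SimpleNamespace
_fake_layer = SimpleNamespace(gamma=1.0, beta=0.0, bn_decay=0.9,
                              _pop_mean=0.0, _pop_var=0.0, _bn_cache=None)
_xbn = np.random.randn(64, 3) * 5.0 + 2.0
_out = batchnorm_forward(_fake_layer, _xbn, is_training=True)
assert np.allclose(np.mean(_out, axis=0), 0.0, atol=1e-6)
assert np.allclose(np.var(_out, axis=0), 1.0, atol=1e-3)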

# grad check
def __compute_approx_grads(nn, x, y, eps=1e-4):
    approx_grads = []
    feed_forward = lambda inp: nn._NeuralNetwork__feedforward(inp, is_training=True)
    for layer in nn.layers:
        assert layer.dropout_prob == 0.0, "Gradient checking cannot be applied to networks with DROPOUT"
        w_ori = layer.weights.copy()
        w_ravel = w_ori.ravel()
        w_shape = w_ori.shape
        for i in range(w_ravel.size):
            w_plus = w_ravel.copy()
            w_plus[i] += eps
            layer.weights = w_plus.reshape(w_shape)
            J_plus = nn.cost_func(y, feed_forward(x)) + (1.0/y.shape[0])*layer.reg_strength*layer.reg_func(layer.weights)
            w_minus = w_ravel.copy()
            w_minus[i] -= eps
            layer.weights = w_minus.reshape(w_shape)
            J_minus = nn.cost_func(y, feed_forward(x)) + (1.0/y.shape[0])*layer.reg_strength*layer.reg_func(layer.weights)
            approx_grads.append((J_plus - J_minus) / (2.0*eps))
        layer.weights = w_ori
    return approx_grads

def gradient_checking(nn, x, y, eps=1e-4, verbose=False, verbose_precision=5):
    from copy import deepcopy
    nn_copy = deepcopy(nn)
    nn.fit(x, y, epochs=0)
    grads = np.concatenate([layer._dweights.ravel() for layer in nn.layers])
    approx_grads = __compute_approx_grads(nn_copy, x, y, eps)
    is_close = np.allclose(grads, approx_grads)
    print("{}".format("\033[92mGRADIENTS OK" if is_close else "\033[91mGRADIENTS FAIL"))
    norm_num = np.linalg.norm(grads - approx_grads)
    norm_den = np.linalg.norm(grads) + np.linalg.norm(approx_grads)
    error = norm_num / norm_den
    print("Relative error:", error)
    if verbose:
        np.set_printoptions(precision=verbose_precision, linewidth=200, suppress=True)
        print("Gradients:  ", grads)
        print("Approximate:", np.array(approx_grads))

# implementation
class Layer():
    def __init__(self, input_dim, output_dim, activation=linear, weights_initializer=random_normal,
                 biases_initializer=ones, dropout_prob=0.0, reg_func=l2_regularization, reg_strength=0.0,
                 batch_norm=False, bn_decay=0.9, is_trainable=True):
        self.input = None
        self.weights = weights_initializer(output_dim, input_dim)
        self.biases = biases_initializer(1, output_dim)
        self.activation = activation
        self.dropout_prob = dropout_prob
        self.reg_func = reg_func
        self.reg_strength = reg_strength
        self.batch_norm = batch_norm
        self.bn_decay = bn_decay
        self.gamma, self.beta = ones(1, output_dim), zeros(1, output_dim)
        self.is_trainable = is_trainable
        self._activ_inp, self._activ_out = None, None
        self._dweights, self._dbiases, self._prev_dweights = None, None, 0.0
        self._dropout_mask = None
        self._dgamma, self._dbeta = None, None
        self._pop_mean, self._pop_var = zeros(1, output_dim), zeros(1, output_dim)
        self._bn_cache = None
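
# illustrative check: a Layer stores weights with shape (output_dim, input_dim),
# which is why the forward pass below uses np.dot(input, weights.T)
_demo_layer = Layer(input_dim=3, output_dim=4)
assert _demo_layer.weights.shape == (4, 3) and _demo_layer.biases.shape == (1, 4)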

class NeuralNetwork():
    def __init__(self, cost_func=mse, learning_rate=1e-3, lr_decay_method=none_decay, lr_decay_rate=0.0,
                 lr_decay_steps=1, momentum=0.0, patience=np.inf):
        self.layers = []
        self.cost_func = cost_func
        self.learning_rate = self.lr_initial = learning_rate
        self.lr_decay_method = lr_decay_method
        self.lr_decay_rate = lr_decay_rate
        self.lr_decay_steps = lr_decay_steps
        self.momentum = momentum
        self.patience, self.waiting = patience, 0
        self._best_model, self._best_loss = self.layers, np.inf

    def fit(self, x_train, y_train, x_val=None, y_val=None, epochs=100, verbose=10, batch_gen=batch_sequential, batch_size=None):
        x_val, y_val = (x_train, y_train) if (x_val is None or y_val is None) else (x_val, y_val)
        for epoch in range(epochs+1):
            self.learning_rate = self.lr_decay_method(self.lr_initial, epoch, self.lr_decay_rate, self.lr_decay_steps)
            for x_batch, y_batch in batch_gen(x_train, y_train, batch_size):
                y_pred = self.__feedforward(x_batch)
                self.__backprop(y_batch, y_pred)
            loss_val = self.cost_func(y_val, self.predict(x_val))
            if loss_val < self._best_loss:
                self._best_model, self._best_loss = self.layers, loss_val
                self.waiting = 0
            else:
                self.waiting += 1
                if self.waiting >= self.patience:
                    self.layers = self._best_model
                    return
            if epoch % verbose == 0:
                loss_train = self.cost_func(y_train, self.predict(x_train))
                loss_reg = (1.0/y_train.shape[0])*np.sum([layer.reg_strength * layer.reg_func(layer.weights) for layer in self.layers])
                print("epoch: {0:=4}/{1} loss_train: {2:.8f} + {3:.8f} = {4:.8f} loss_val = {5:.8f}".format(epoch, epochs, loss_train, loss_reg, loss_train + loss_reg, loss_val))

    def predict(self, x):
        return self.__feedforward(x, is_training=False)

    def save(self, file_path):
        pkl.dump(self, open(file_path, 'wb'), -1)

    @staticmethod
    def load(file_path):
        return pkl.load(open(file_path, 'rb'))

    def __feedforward(self, x, is_training=True):
        self.layers[0].input = x
        for current_layer, next_layer in zip(self.layers, self.layers[1:] + [Layer(0, 0)]):
            y = np.dot(current_layer.input, current_layer.weights.T) + current_layer.biases
            y = batchnorm_forward(current_layer, y, is_training) if current_layer.batch_norm else y
            current_layer._dropout_mask = np.random.binomial(1, 1.0-current_layer.dropout_prob, y.shape) / (1.0-current_layer.dropout_prob)
            current_layer._activ_inp = y
            current_layer._activ_out = current_layer.activation(y) * (current_layer._dropout_mask if is_training else 1.0)
            next_layer.input = current_layer._activ_out
        return self.layers[-1]._activ_out

    def __backprop(self, y, y_pred):
        last_delta = self.cost_func(y, y_pred, derivative=True)
        for layer in reversed(self.layers):
            dactivation = layer.activation(layer._activ_inp, derivative=True) * last_delta * layer._dropout_mask
            dactivation = batchnorm_backward(layer, dactivation) if layer.batch_norm else dactivation
            last_delta = np.dot(dactivation, layer.weights)
            layer._dweights = np.dot(dactivation.T, layer.input)
            layer._dbiases = 1.0*dactivation.sum(axis=0, keepdims=True)
        for layer in reversed(self.layers):
            if layer.is_trainable:
                layer._dweights = layer._dweights + (1.0/y.shape[0]) * layer.reg_strength * layer.reg_func(layer.weights, derivative=True)
                layer._prev_dweights = -self.learning_rate*layer._dweights + self.momentum*layer._prev_dweights
                layer.weights = layer.weights + layer._prev_dweights
                layer.biases = layer.biases - self.learning_rate*layer._dbiases
                if layer.batch_norm:
                    layer.gamma = layer.gamma - self.learning_rate*layer._dgamma
                    layer.beta = layer.beta - self.learning_rate*layer._dbeta

# example 1
print("------------ example 1 ------------")
x = np.array([[0.05, 0.10]])
y = np.array([[0.01, 0.99]])
D_in, D_out = x.shape[1], y.shape[1]

nn = NeuralNetwork(cost_func=mse, learning_rate=0.5)
nn.layers.append(Layer(input_dim=D_in, output_dim=2, activation=sigmoid))
nn.layers.append(Layer(input_dim=2, output_dim=D_out, activation=sigmoid))

w1 = np.array([[0.15, 0.20], [0.25, 0.30]])
b1 = np.array([[0.35]])  # note: there should be 2 biases here - one for each neuron (1, 2)
w2 = np.array([[0.40, 0.45], [0.50, 0.55]])
b2 = np.array([[0.60]])  # note: there should be 2 biases here - one for each neuron (1, 2)

nn.layers[0].weights = w1
nn.layers[0].biases = b1
nn.layers[1].weights = w2
nn.layers[1].biases = b2

nn.fit(x, y, epochs=0, verbose=1)
for layer in nn.layers:
    print(layer.weights)

# example 2
print()
print("------------ example 2 ------------")
x = np.array([[0.1, 0.2, 0.7]])
y = np.array([[1, 0, 0]])
D_in, D_out = x.shape[1], y.shape[1]

nn = NeuralNetwork(cost_func=softmax_neg_log_likelihood, learning_rate=0.01)
nn.layers.append(Layer(input_dim=D_in, output_dim=3, activation=relu))
nn.layers.append(Layer(input_dim=3, output_dim=3, activation=sigmoid))
nn.layers.append(Layer(input_dim=3, output_dim=D_out, activation=linear))

w1 = np.array([[0.1, 0.2, 0.3], [0.3, 0.2, 0.7], [0.4, 0.3, 0.9]])
b1 = np.ones((1, 3))
w2 = np.array([[0.2, 0.3, 0.5], [0.3, 0.5, 0.7], [0.6, 0.4, 0.8]])
b2 = np.ones((1, 3))
w3 = np.array([[0.1, 0.4, 0.8], [0.3, 0.7, 0.2], [0.5, 0.2, 0.9]])
b3 = np.ones((1, 3))
for i, w, b in zip(range(3), [w1, w2, w3], [b1, b2, b3]):
    nn.layers[i].weights = w
    nn.layers[i].biases = b

nn.fit(x, y, epochs=300, verbose=30)
for layer in nn.layers:
    print(layer.weights)

nn.save('model.pkl')

# restart notebook and create new cell
nn = NeuralNetwork.load('model.pkl')
for layer in nn.layers:
    print(layer.weights)

# gradient checking
print()
print("------------ grad. check ------------")
np.random.seed(1234)
N, D = 100, 2
x = np.random.rand(N, D)
y = np.random.rand(N, 1)

# regression
D_in, D_out = x.shape[1], y.shape[1]
nn = NeuralNetwork(cost_func=mse, learning_rate=1e-3, momentum=0.9, lr_decay_method=staircase_decay, lr_decay_rate=0.5, lr_decay_steps=10)
nn.layers.append(Layer(input_dim=D_in, output_dim=4, activation=relu, reg_func=l2_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=4, output_dim=1, activation=tanh, reg_func=l1_regularization, reg_strength=1e-4))
nn.layers.append(Layer(input_dim=1, output_dim=2, activation=sigmoid, reg_func=l1_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=2, output_dim=5, activation=leaky_relu, reg_func=l2_regularization, reg_strength=1e-2))
nn.layers.append(Layer(input_dim=5, output_dim=3, activation=elu, reg_func=l1_regularization, reg_strength=1e-3, batch_norm=True))
nn.layers.append(Layer(input_dim=3, output_dim=D_out, activation=linear, reg_func=l2_regularization, reg_strength=1e-3, batch_norm=True))

nn.fit(x, y, epochs=100)
gradient_checking(nn, x, y, eps=1e-4, verbose=True)

# binary classification
y = np.random.randint(0, 2, (N, 1))
D_in, D_out = x.shape[1], y.shape[1]
nn = NeuralNetwork(cost_func=sigmoid_cross_entropy, learning_rate=1e-3, momentum=0.9, lr_decay_method=staircase_decay, lr_decay_rate=0.5, lr_decay_steps=10)
nn.layers.append(Layer(input_dim=D_in, output_dim=4, activation=relu, reg_func=l2_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=4, output_dim=1, activation=tanh, reg_func=l1_regularization, reg_strength=1e-4))
nn.layers.append(Layer(input_dim=1, output_dim=2, activation=sigmoid, reg_func=l1_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=2, output_dim=5, activation=leaky_relu, reg_func=l2_regularization, reg_strength=1e-2))
nn.layers.append(Layer(input_dim=5, output_dim=3, activation=elu, reg_func=l1_regularization, reg_strength=1e-3, batch_norm=True))
nn.layers.append(Layer(input_dim=3, output_dim=D_out, activation=linear, reg_func=l2_regularization, reg_strength=1e-3, batch_norm=True))

nn.fit(x, y, epochs=100)
gradient_checking(nn, x, y, eps=1e-4, verbose=False)

# multiclass classification
from sklearn.preprocessing import OneHotEncoder
y = np.random.randint(0, 2, (N, 1))
y_oh = OneHotEncoder(sparse=False).fit_transform(y)
D_in, D_out = x.shape[1], y_oh.shape[1]
nn = NeuralNetwork(cost_func=softmax_neg_log_likelihood, learning_rate=1e-3, momentum=0.9, lr_decay_method=staircase_decay, lr_decay_rate=0.5, lr_decay_steps=10)
nn.layers.append(Layer(input_dim=D_in, output_dim=4, activation=relu, reg_func=l2_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=4, output_dim=1, activation=tanh, reg_func=l1_regularization, reg_strength=1e-4))
nn.layers.append(Layer(input_dim=1, output_dim=2, activation=sigmoid, reg_func=l1_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=2, output_dim=5, activation=leaky_relu, reg_func=l2_regularization, reg_strength=1e-2))
nn.layers.append(Layer(input_dim=5, output_dim=3, activation=elu, reg_func=l1_regularization, reg_strength=1e-3, batch_norm=True))
nn.layers.append(Layer(input_dim=3, output_dim=D_out, activation=linear, reg_func=l2_regularization, reg_strength=1e-3, batch_norm=True))

nn.fit(x, y_oh, epochs=100)
gradient_checking(nn, x, y_oh, eps=1e-4, verbose=False)