Manual Prático do Deep Learning - Rede Neural
import numpy as np
import _pickle as pkl

# activation functions
def linear(x, derivative=False):
    return np.ones_like(x) if derivative else x

def sigmoid(x, derivative=False):
    if derivative:
        y = sigmoid(x)
        return y*(1 - y)
    return 1.0/(1.0 + np.exp(-x))

def tanh(x, derivative=False):
    if derivative:
        y = tanh(x)
        return 1 - y**2
    return (np.exp(x) - np.exp(-x))/(np.exp(x) + np.exp(-x))

def relu(x, derivative=False):
    if derivative:
        return np.where(x <= 0, 0, 1)
    return np.maximum(0, x)

def leaky_relu(x, derivative=False):
    alpha = 0.1
    if derivative:
        return np.where(x <= 0, alpha, 1)
    return np.where(x <= 0, alpha*x, x)

def elu(x, derivative=False):
    alpha = 1.0
    if derivative:
        y = elu(x)
        return np.where(x <= 0, y + alpha, 1)
    return np.where(x <= 0, alpha*(np.exp(x) - 1), x)
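
# quick sanity check (illustrative only, not used by the network): the analytic
# derivative of sigmoid should agree with a central finite-difference estimate
_z = np.array([[-2.0, -0.5, 0.0, 0.5, 2.0]])
_eps = 1e-6
_fd = (sigmoid(_z + _eps) - sigmoid(_z - _eps)) / (2*_eps)
assert np.allclose(sigmoid(_z, derivative=True), _fd, atol=1e-6)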

# other functions
def softmax(x, y_oh=None, derivative=False):
    if derivative:
        y_pred = softmax(x)
        y_correct = np.argmax(y_oh, axis=1)
        pk = y_pred[range(y_pred.shape[0]), y_correct]
        y_pred[range(y_pred.shape[0]), y_correct] = pk*(1.0 - pk)
        return y_pred
    exp = np.exp(x)
    return exp/np.sum(exp, axis=1, keepdims=True)

def neg_log_likelihood(y_oh, y_pred, derivative=False):
    y_correct = np.argmax(y_oh, axis=1)
    pk = y_pred[range(y_pred.shape[0]), y_correct]
    if derivative:
        y_pred[range(y_pred.shape[0]), y_correct] = (-1.0/pk)
        return y_pred
    return np.mean(-np.log(pk))
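
# illustrative check: each softmax row is a probability distribution (non-negative, sums to 1)
_logits = np.array([[1.0, 2.0, 3.0], [0.1, 0.1, 0.1]])
_probs = softmax(_logits)
assert np.allclose(np.sum(_probs, axis=1), 1.0) and np.all(_probs >= 0)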

# cost functions
def mae(y, y_pred, derivative=False):
    if derivative:
        return np.where(y_pred > y, 1, -1) / y.shape[0]
    return np.mean(np.abs(y - y_pred))

def mse(y, y_pred, derivative=False):
    if derivative:
        return -(y - y_pred) / y.shape[0]
    return 0.5*np.mean((y - y_pred)**2)

def binary_cross_entropy(y, y_pred, derivative=False):
    if derivative:
        return -(y - y_pred) / (y_pred * (1-y_pred) * y.shape[0])
    return -np.mean(y*np.log(y_pred) + (1-y)*np.log(1-y_pred))

def sigmoid_cross_entropy(y, y_pred, derivative=False):
    y_sigmoid = sigmoid(y_pred)
    if derivative:
        return -(y - y_sigmoid) / y.shape[0]
    return -np.mean(y*np.log(y_sigmoid) + (1-y)*np.log(1-y_sigmoid))

def softmax_neg_log_likelihood(y_oh, y_pred, derivative=False):
    y_softmax = softmax(y_pred)
    y_correct = np.argmax(y_oh, axis=1)
    pk = y_softmax[range(y_softmax.shape[0]), y_correct]
    if derivative:
        return -(y_oh - y_softmax)/y_oh.shape[0]
    return np.mean(-np.log(pk))
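
# illustrative check: mse's analytic gradient vs. a finite-difference estimate
# for a single-output regression target (shapes (N, 1))
_y = np.array([[1.0], [0.0], [0.5]])
_p = np.array([[0.8], [0.2], [0.4]])
_eps = 1e-6
_g = mse(_y, _p, derivative=True)
for _i in range(_p.shape[0]):
    _pp, _pm = _p.copy(), _p.copy()
    _pp[_i, 0] += _eps
    _pm[_i, 0] -= _eps
    assert np.isclose(_g[_i, 0], (mse(_y, _pp) - mse(_y, _pm)) / (2*_eps))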

# weights initialization
def zeros(rows, cols):
    return np.zeros((rows, cols))

def ones(rows, cols):
    return np.ones((rows, cols))

def random_normal(rows, cols):
    return np.random.randn(rows, cols)

def random_uniform(rows, cols):
    return np.random.rand(rows, cols)

def glorot_normal(rows, cols):
    # normal with mean=0 and stddev=sqrt(2.0 / (out + inp)). See the notes of np.random.randn.
    std_dev = np.sqrt(2.0 / (rows + cols))
    return std_dev*np.random.randn(rows, cols)

def glorot_uniform(rows, cols):
    # uniform in [-limit, limit], where limit = np.sqrt(6.0 / (out + inp))
    limit = np.sqrt(6.0 / (rows + cols))
    return 2*limit*np.random.rand(rows, cols) - limit
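
# illustrative check: glorot_normal samples should have std close to sqrt(2/(rows+cols))
_w = glorot_normal(500, 300)
assert np.isclose(np.std(_w), np.sqrt(2.0 / (500 + 300)), rtol=0.05)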

# regularization
def l1_regularization(weights, derivative=False):
    if derivative:
        weights = [np.where(w < 0, -1, w) for w in weights]
        return np.array([np.where(w > 0, 1, w) for w in weights])
    return np.sum([np.sum(np.abs(w)) for w in weights])

def l2_regularization(weights, derivative=False):
    if derivative:
        return weights
    return 0.5 * np.sum(weights**2)
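
# illustrative example: the L2 penalty of a small weight matrix and its gradient
# (the gradient of 0.5*sum(w**2) with respect to w is simply w)
_w = np.array([[1.0, -2.0], [0.0, 3.0]])
assert np.isclose(l2_regularization(_w), 0.5 * (1 + 4 + 0 + 9))
assert np.allclose(l2_regularization(_w, derivative=True), _w)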

# batch generator
def batch_sequential(x, y, batch_size=None):
    batch_size = x.shape[0] if batch_size is None else batch_size
    n_batches = x.shape[0] // batch_size
    for batch in range(n_batches):
        offset = batch_size * batch
        x_batch, y_batch = x[offset:offset+batch_size], y[offset:offset+batch_size]
        yield (x_batch, y_batch)

def batch_shuffle(x, y, batch_size=None):
    shuffle_index = np.random.permutation(range(x.shape[0]))
    return batch_sequential(x[shuffle_index], y[shuffle_index], batch_size)
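
# illustrative usage: 10 samples with batch_size=4 yield 2 full batches (the remainder is dropped)
_xb = np.arange(10).reshape(10, 1)
_yb = np.arange(10).reshape(10, 1)
_batches = list(batch_sequential(_xb, _yb, batch_size=4))
assert len(_batches) == 2 and _batches[0][0].shape == (4, 1)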

# learning rate decay
def none_decay(learning_rate, epoch, decay_rate, decay_steps=1):
    return learning_rate

def time_based_decay(learning_rate, epoch, decay_rate, decay_steps=1):
    return learning_rate / (1 + decay_rate * epoch)

def exponential_decay(learning_rate, epoch, decay_rate, decay_steps=1):
    return learning_rate * decay_rate**epoch

def staircase_decay(learning_rate, epoch, decay_rate, decay_steps=1):
    return learning_rate * decay_rate**(epoch // decay_steps)
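
# illustrative example: staircase decay halves the learning rate every `decay_steps` epochs
# (here: learning_rate=0.1, decay_rate=0.5, decay_steps=10)
assert staircase_decay(0.1, 5, 0.5, 10) == 0.1
assert staircase_decay(0.1, 10, 0.5, 10) == 0.1 * 0.5
assert staircase_decay(0.1, 25, 0.5, 10) == 0.1 * 0.25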

# batch normalization
def batchnorm_forward(layer, x, is_training=True):
    mu = np.mean(x, axis=0) if is_training else layer._pop_mean
    var = np.var(x, axis=0) if is_training else layer._pop_var
    x_norm = (x - mu) / np.sqrt(var + 1e-8)
    out = layer.gamma * x_norm + layer.beta
    if is_training:
        layer._pop_mean = layer.bn_decay * layer._pop_mean + (1.0-layer.bn_decay)*mu
        layer._pop_var = layer.bn_decay * layer._pop_var + (1.0-layer.bn_decay)*var
        layer._bn_cache = (x, x_norm, mu, var)
    return out

def batchnorm_backward(layer, dactivation):
    x, x_norm, mu, var = layer._bn_cache
    m = layer._activ_inp.shape[0]
    x_mu = x - mu
    std_inv = 1. / np.sqrt(var + 1e-8)
    dx_norm = dactivation * layer.gamma
    dvar = np.sum(dx_norm * x_mu, axis=0) * -0.5 * (std_inv**3)
    dmu = np.sum(dx_norm * -std_inv, axis=0) + dvar * np.mean(-2.0 * x_mu, axis=0)
    dx = (dx_norm * std_inv) + (dvar * 2.0 * x_mu / m) + (dmu / m)
    layer._dgamma = np.sum(dactivation * x_norm, axis=0)
    layer._dbeta = np.sum(dactivation, axis=0)
    return dx
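
# illustrative check with a stand-in "layer" object (SimpleNamespace is only a stand-in
# for the Layer class defined below): with gamma=1 and beta=0, the normalized batch
# should have per-feature mean ~0 and variance ~1
from types import SimpleNamespace
_fake_layer = SimpleNamespace(gamma=1.0, beta=0.0, bn_decay=0.9,
                              _pop_mean=0.0, _pop_var=0.0, _bn_cache=None)
_xbn = np.random.randn(64, 3) * 5.0 + 2.0
_out = batchnorm_forward(_fake_layer, _xbn, is_training=True)
assert np.allclose(np.mean(_out, axis=0), 0.0, atol=1e-6)
assert np.allclose(np.var(_out, axis=0), 1.0, atol=1e-3)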

# grad check
def __compute_approx_grads(nn, x, y, eps=1e-4):
    approx_grads = []
    feed_forward = lambda inp: nn._NeuralNetwork__feedforward(inp, is_training=True)
    for layer in nn.layers:
        assert layer.dropout_prob == 0.0, "Gradient checking cannot be applied to networks with DROPOUT"
        w_ori = layer.weights.copy()
        w_ravel = w_ori.ravel()
        w_shape = w_ori.shape
        for i in range(w_ravel.size):
            w_plus = w_ravel.copy()
            w_plus[i] += eps
            layer.weights = w_plus.reshape(w_shape)
            J_plus = nn.cost_func(y, feed_forward(x)) + (1.0/y.shape[0])*layer.reg_strength*layer.reg_func(layer.weights)
            w_minus = w_ravel.copy()
            w_minus[i] -= eps
            layer.weights = w_minus.reshape(w_shape)
            J_minus = nn.cost_func(y, feed_forward(x)) + (1.0/y.shape[0])*layer.reg_strength*layer.reg_func(layer.weights)
            approx_grads.append((J_plus - J_minus) / (2.0*eps))
        layer.weights = w_ori
    return approx_grads

def gradient_checking(nn, x, y, eps=1e-4, verbose=False, verbose_precision=5):
    from copy import deepcopy
    nn_copy = deepcopy(nn)
    nn.fit(x, y, epochs=0)
    grads = np.concatenate([layer._dweights.ravel() for layer in nn.layers])
    approx_grads = __compute_approx_grads(nn_copy, x, y, eps)
    is_close = np.allclose(grads, approx_grads)
    print("{}".format("\033[92mGRADIENTS OK" if is_close else "\033[91mGRADIENTS FAIL"))
    norm_num = np.linalg.norm(grads - approx_grads)
    norm_den = np.linalg.norm(grads) + np.linalg.norm(approx_grads)
    error = norm_num / norm_den
    print("Relative error:", error)
    if verbose:
        np.set_printoptions(precision=verbose_precision, linewidth=200, suppress=True)
        print("Gradients:  ", grads)
        print("Approximate:", np.array(approx_grads))

# implementation
class Layer():
    def __init__(self, input_dim, output_dim, activation=linear, weights_initializer=random_normal,
                 biases_initializer=ones, dropout_prob=0.0, reg_func=l2_regularization, reg_strength=0.0,
                 batch_norm=False, bn_decay=0.9, is_trainable=True):
        self.input = None
        self.weights = weights_initializer(output_dim, input_dim)
        self.biases = biases_initializer(1, output_dim)
        self.activation = activation
        self.dropout_prob = dropout_prob
        self.reg_func = reg_func
        self.reg_strength = reg_strength
        self.batch_norm = batch_norm
        self.bn_decay = bn_decay
        self.gamma, self.beta = ones(1, output_dim), zeros(1, output_dim)
        self.is_trainable = is_trainable
        self._activ_inp, self._activ_out = None, None
        self._dweights, self._dbiases, self._prev_dweights = None, None, 0.0
        self._dropout_mask = None
        self._dgamma, self._dbeta = None, None
        self._pop_mean, self._pop_var = zeros(1, output_dim), zeros(1, output_dim)
        self._bn_cache = None
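
# illustrative check: a Layer stores weights with shape (output_dim, input_dim),
# which is why the forward pass below uses np.dot(input, weights.T)
_demo_layer = Layer(input_dim=3, output_dim=4)
assert _demo_layer.weights.shape == (4, 3) and _demo_layer.biases.shape == (1, 4)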

class NeuralNetwork():
    def __init__(self, cost_func=mse, learning_rate=1e-3, lr_decay_method=none_decay, lr_decay_rate=0.0,
                 lr_decay_steps=1, momentum=0.0, patience=np.inf):
        self.layers = []
        self.cost_func = cost_func
        self.learning_rate = self.lr_initial = learning_rate
        self.lr_decay_method = lr_decay_method
        self.lr_decay_rate = lr_decay_rate
        self.lr_decay_steps = lr_decay_steps
        self.momentum = momentum
        self.patience, self.waiting = patience, 0
        self._best_model, self._best_loss = self.layers, np.inf

    def fit(self, x_train, y_train, x_val=None, y_val=None, epochs=100, verbose=10, batch_gen=batch_sequential, batch_size=None):
        x_val, y_val = (x_train, y_train) if (x_val is None or y_val is None) else (x_val, y_val)
        for epoch in range(epochs+1):
            self.learning_rate = self.lr_decay_method(self.lr_initial, epoch, self.lr_decay_rate, self.lr_decay_steps)
            for x_batch, y_batch in batch_gen(x_train, y_train, batch_size):
                y_pred = self.__feedforward(x_batch)
                self.__backprop(y_batch, y_pred)
            loss_val = self.cost_func(y_val, self.predict(x_val))
            if loss_val < self._best_loss:
                self._best_model, self._best_loss = self.layers, loss_val
                self.waiting = 0
            else:
                self.waiting += 1
                if self.waiting >= self.patience:
                    self.layers = self._best_model
                    return
            if epoch % verbose == 0:
                loss_train = self.cost_func(y_train, self.predict(x_train))
                loss_reg = (1.0/y_train.shape[0])*np.sum([layer.reg_strength * layer.reg_func(layer.weights) for layer in self.layers])
                print("epoch: {0:=4}/{1} loss_train: {2:.8f} + {3:.8f} = {4:.8f} loss_val = {5:.8f}".format(epoch, epochs, loss_train, loss_reg, loss_train + loss_reg, loss_val))

    def predict(self, x):
        return self.__feedforward(x, is_training=False)

    def save(self, file_path):
        pkl.dump(self, open(file_path, 'wb'), -1)

    @staticmethod
    def load(file_path):
        return pkl.load(open(file_path, 'rb'))

    def __feedforward(self, x, is_training=True):
        self.layers[0].input = x
        for current_layer, next_layer in zip(self.layers, self.layers[1:] + [Layer(0, 0)]):
            y = np.dot(current_layer.input, current_layer.weights.T) + current_layer.biases
            y = batchnorm_forward(current_layer, y, is_training) if current_layer.batch_norm else y
            current_layer._dropout_mask = np.random.binomial(1, 1.0-current_layer.dropout_prob, y.shape) / (1.0-current_layer.dropout_prob)
            current_layer._activ_inp = y
            current_layer._activ_out = current_layer.activation(y) * (current_layer._dropout_mask if is_training else 1.0)
            next_layer.input = current_layer._activ_out
        return self.layers[-1]._activ_out

    def __backprop(self, y, y_pred):
        last_delta = self.cost_func(y, y_pred, derivative=True)
        for layer in reversed(self.layers):
            dactivation = layer.activation(layer._activ_inp, derivative=True) * last_delta * layer._dropout_mask
            dactivation = batchnorm_backward(layer, dactivation) if layer.batch_norm else dactivation
            last_delta = np.dot(dactivation, layer.weights)
            layer._dweights = np.dot(dactivation.T, layer.input)
            layer._dbiases = 1.0*dactivation.sum(axis=0, keepdims=True)
        for layer in reversed(self.layers):
            if layer.is_trainable:
                layer._dweights = layer._dweights + (1.0/y.shape[0]) * layer.reg_strength * layer.reg_func(layer.weights, derivative=True)
                layer._prev_dweights = -self.learning_rate*layer._dweights + self.momentum*layer._prev_dweights
                layer.weights = layer.weights + layer._prev_dweights
                layer.biases = layer.biases - self.learning_rate*layer._dbiases
                if layer.batch_norm:
                    layer.gamma = layer.gamma - self.learning_rate*layer._dgamma
                    layer.beta = layer.beta - self.learning_rate*layer._dbeta

# example 1
print("------------ example 1 ------------")
x = np.array([[0.05, 0.10]])
y = np.array([[0.01, 0.99]])
D_in, D_out = x.shape[1], y.shape[1]

nn = NeuralNetwork(cost_func=mse, learning_rate=0.5)
nn.layers.append(Layer(input_dim=D_in, output_dim=2, activation=sigmoid))
nn.layers.append(Layer(input_dim=2, output_dim=D_out, activation=sigmoid))

w1 = np.array([[0.15, 0.20], [0.25, 0.30]])
b1 = np.array([[0.35]])  # note: there should be 2 biases here - one for each neuron (1, 2)
w2 = np.array([[0.40, 0.45], [0.50, 0.55]])
b2 = np.array([[0.60]])  # note: there should be 2 biases here - one for each neuron (1, 2)

nn.layers[0].weights = w1
nn.layers[0].biases = b1
nn.layers[1].weights = w2
nn.layers[1].biases = b2

nn.fit(x, y, epochs=0, verbose=1)
for layer in nn.layers:
    print(layer.weights)

# example 2
print()
print("------------ example 2 ------------")
x = np.array([[0.1, 0.2, 0.7]])
y = np.array([[1, 0, 0]])
D_in, D_out = x.shape[1], y.shape[1]

nn = NeuralNetwork(cost_func=softmax_neg_log_likelihood, learning_rate=0.01)
nn.layers.append(Layer(input_dim=D_in, output_dim=3, activation=relu))
nn.layers.append(Layer(input_dim=3, output_dim=3, activation=sigmoid))
nn.layers.append(Layer(input_dim=3, output_dim=D_out, activation=linear))

w1 = np.array([[0.1, 0.2, 0.3], [0.3, 0.2, 0.7], [0.4, 0.3, 0.9]])
b1 = np.ones((1, 3))
w2 = np.array([[0.2, 0.3, 0.5], [0.3, 0.5, 0.7], [0.6, 0.4, 0.8]])
b2 = np.ones((1, 3))
w3 = np.array([[0.1, 0.4, 0.8], [0.3, 0.7, 0.2], [0.5, 0.2, 0.9]])
b3 = np.ones((1, 3))
for i, w, b in zip(range(3), [w1, w2, w3], [b1, b2, b3]):
    nn.layers[i].weights = w
    nn.layers[i].biases = b

nn.fit(x, y, epochs=300, verbose=30)
for layer in nn.layers:
    print(layer.weights)

nn.save('model.pkl')

# restart notebook and create new cell
nn = NeuralNetwork.load('model.pkl')
for layer in nn.layers:
    print(layer.weights)

# gradient checking
print()
print("------------ grad. check ------------")
np.random.seed(1234)
N, D = 100, 2
x = np.random.rand(N, D)
y = np.random.rand(N, 1)

# regression
D_in, D_out = x.shape[1], y.shape[1]
nn = NeuralNetwork(cost_func=mse, learning_rate=1e-3, momentum=0.9, lr_decay_method=staircase_decay, lr_decay_rate=0.5, lr_decay_steps=10)
nn.layers.append(Layer(input_dim=D_in, output_dim=4, activation=relu, reg_func=l2_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=4, output_dim=1, activation=tanh, reg_func=l1_regularization, reg_strength=1e-4))
nn.layers.append(Layer(input_dim=1, output_dim=2, activation=sigmoid, reg_func=l1_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=2, output_dim=5, activation=leaky_relu, reg_func=l2_regularization, reg_strength=1e-2))
nn.layers.append(Layer(input_dim=5, output_dim=3, activation=elu, reg_func=l1_regularization, reg_strength=1e-3, batch_norm=True))
nn.layers.append(Layer(input_dim=3, output_dim=D_out, activation=linear, reg_func=l2_regularization, reg_strength=1e-3, batch_norm=True))

nn.fit(x, y, epochs=100)
gradient_checking(nn, x, y, eps=1e-4, verbose=True)

# binary classification
y = np.random.randint(0, 2, (N, 1))
D_in, D_out = x.shape[1], y.shape[1]
nn = NeuralNetwork(cost_func=sigmoid_cross_entropy, learning_rate=1e-3, momentum=0.9, lr_decay_method=staircase_decay, lr_decay_rate=0.5, lr_decay_steps=10)
nn.layers.append(Layer(input_dim=D_in, output_dim=4, activation=relu, reg_func=l2_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=4, output_dim=1, activation=tanh, reg_func=l1_regularization, reg_strength=1e-4))
nn.layers.append(Layer(input_dim=1, output_dim=2, activation=sigmoid, reg_func=l1_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=2, output_dim=5, activation=leaky_relu, reg_func=l2_regularization, reg_strength=1e-2))
nn.layers.append(Layer(input_dim=5, output_dim=3, activation=elu, reg_func=l1_regularization, reg_strength=1e-3, batch_norm=True))
nn.layers.append(Layer(input_dim=3, output_dim=D_out, activation=linear, reg_func=l2_regularization, reg_strength=1e-3, batch_norm=True))

nn.fit(x, y, epochs=100)
gradient_checking(nn, x, y, eps=1e-4, verbose=False)

# multiclass classification
from sklearn.preprocessing import OneHotEncoder
y = np.random.randint(0, 2, (N, 1))
y_oh = OneHotEncoder(sparse=False).fit_transform(y)
D_in, D_out = x.shape[1], y_oh.shape[1]
nn = NeuralNetwork(cost_func=softmax_neg_log_likelihood, learning_rate=1e-3, momentum=0.9, lr_decay_method=staircase_decay, lr_decay_rate=0.5, lr_decay_steps=10)
nn.layers.append(Layer(input_dim=D_in, output_dim=4, activation=relu, reg_func=l2_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=4, output_dim=1, activation=tanh, reg_func=l1_regularization, reg_strength=1e-4))
nn.layers.append(Layer(input_dim=1, output_dim=2, activation=sigmoid, reg_func=l1_regularization, reg_strength=1.0, batch_norm=True))
nn.layers.append(Layer(input_dim=2, output_dim=5, activation=leaky_relu, reg_func=l2_regularization, reg_strength=1e-2))
nn.layers.append(Layer(input_dim=5, output_dim=3, activation=elu, reg_func=l1_regularization, reg_strength=1e-3, batch_norm=True))
nn.layers.append(Layer(input_dim=3, output_dim=D_out, activation=linear, reg_func=l2_regularization, reg_strength=1e-3, batch_norm=True))

nn.fit(x, y_oh, epochs=100)
gradient_checking(nn, x, y_oh, eps=1e-4, verbose=False)